New upstream version 0.5
Steffen
2 years ago
0 | name: Tests | |
1 | ||
2 | on: | |
3 | push: | |
4 | pull_request: | |
5 | ||
6 | jobs: | |
7 | build: | |
8 | runs-on: ${{ matrix.os }} | |
9 | name: Python ${{ matrix.python-version }} on ${{ matrix.os }} | |
10 | strategy: | |
11 | matrix: | |
12 | os: [ubuntu-latest, macos-latest, windows-latest] | |
13 | python-version: ['3.7', '3.8', '3.9'] | |
14 | ||
15 | steps: | |
16 | - uses: actions/checkout@v2 | |
17 | ||
18 | - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }} | |
19 | uses: actions/setup-python@v2 | |
20 | with: | |
21 | python-version: ${{ matrix.python-version }} | |
22 | ||
23 | - name: Install dependencies | |
24 | run: | | |
25 | pip3 install codecov pytest-cov || pip3 install --user codecov pytest-cov; | |
26 | ||
27 | - name: Run tests | |
28 | run: | | |
29 | pip3 install . | |
30 | coverage run --source=rdata/ --omit=rdata/tests/ setup.py test; | |
31 | ||
32 | - name: Upload coverage to Codecov | |
33 | uses: codecov/codecov-action@v1 |
0 | # Byte-compiled / optimized / DLL files | |
1 | __pycache__/ | |
2 | *.py[cod] | |
3 | *$py.class | |
4 | ||
5 | # C extensions | |
6 | *.so | |
7 | ||
8 | # Distribution / packaging | |
9 | .Python | |
10 | build/ | |
11 | develop-eggs/ | |
12 | dist/ | |
13 | downloads/ | |
14 | eggs/ | |
15 | .eggs/ | |
16 | lib/ | |
17 | lib64/ | |
18 | parts/ | |
19 | sdist/ | |
20 | var/ | |
21 | wheels/ | |
22 | *.egg-info/ | |
23 | .installed.cfg | |
24 | *.egg | |
25 | MANIFEST | |
26 | ||
27 | # PyInstaller | |
28 | # Usually these files are written by a python script from a template | |
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. | |
30 | *.manifest | |
31 | *.spec | |
32 | ||
33 | # Installer logs | |
34 | pip-log.txt | |
35 | pip-delete-this-directory.txt | |
36 | ||
37 | # Unit test / coverage reports | |
38 | htmlcov/ | |
39 | .tox/ | |
40 | .coverage | |
41 | .coverage.* | |
42 | .cache | |
43 | nosetests.xml | |
44 | coverage.xml | |
45 | *.cover | |
46 | .hypothesis/ | |
47 | .pytest_cache/ | |
48 | ||
49 | # Translations | |
50 | *.mo | |
51 | *.pot | |
52 | ||
53 | # Django stuff: | |
54 | *.log | |
55 | local_settings.py | |
56 | db.sqlite3 | |
57 | ||
58 | # Flask stuff: | |
59 | instance/ | |
60 | .webassets-cache | |
61 | ||
62 | # Scrapy stuff: | |
63 | .scrapy | |
64 | ||
65 | # Sphinx documentation | |
66 | docs/_build/ | |
67 | ||
68 | # PyBuilder | |
69 | target/ | |
70 | ||
71 | # Jupyter Notebook | |
72 | .ipynb_checkpoints | |
73 | ||
74 | # pyenv | |
75 | .python-version | |
76 | ||
77 | # celery beat schedule file | |
78 | celerybeat-schedule | |
79 | ||
80 | # SageMath parsed files | |
81 | *.sage.py | |
82 | ||
83 | # Environments | |
84 | .env | |
85 | .venv | |
86 | env/ | |
87 | venv/ | |
88 | ENV/ | |
89 | env.bak/ | |
90 | venv.bak/ | |
91 | ||
92 | # Spyder project settings | |
93 | .spyderproject | |
94 | .spyproject | |
95 | ||
96 | # Rope project settings | |
97 | .ropeproject | |
98 | ||
99 | # mkdocs documentation | |
100 | /site | |
101 | ||
102 | # mypy | |
103 | .mypy_cache/ |
0 | MIT License | |
1 | ||
2 | Copyright (c) 2018 Carlos Ramos Carreño | |
3 | ||
4 | Permission is hereby granted, free of charge, to any person obtaining a copy | |
5 | of this software and associated documentation files (the "Software"), to deal | |
6 | in the Software without restriction, including without limitation the rights | |
7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
8 | copies of the Software, and to permit persons to whom the Software is | |
9 | furnished to do so, subject to the following conditions: | |
10 | ||
11 | The above copyright notice and this permission notice shall be included in all | |
12 | copies or substantial portions of the Software. | |
13 | ||
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
20 | SOFTWARE. |
0 | rdata | |
1 | ===== | |
2 | ||
3 | |build-status| |docs| |coverage| |landscape| |pypi| | |
4 | ||
5 | Read R datasets from Python. | |
6 | ||
7 | .. | |
8 | Github does not support include in README for dubious security reasons, so | |
9 | we copy-paste instead. Also Github does not understand Sphinx directives. | |
10 | .. include:: docs/simpleusage.rst | |
11 | ||
12 | Installation | |
13 | ============ | |
14 | ||
15 | rdata is on PyPI and can be installed using :code:`pip`: | |
16 | ||
17 | .. code:: | |
18 | ||
19 | pip install rdata | |
20 | ||
21 | It is also available for :code:`conda` using the :code:`conda-forge` channel: | |
22 | ||
23 | .. code:: | |
24 | ||
25 | conda install -c conda-forge rdata | |
26 | ||
27 | Documentation | |
28 | ============= | |
29 | ||
30 | The documentation of rdata is in | |
31 | `ReadTheDocs <https://rdata.readthedocs.io/en/latest/>`_. | |
32 | ||
33 | Simple usage | |
34 | ============ | |
35 | ||
36 | Read a R dataset | |
37 | ---------------- | |
38 | ||
39 | The common way of reading an R dataset is the following one: | |
40 | ||
41 | >>> import rdata | |
42 | ||
43 | >>> parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH / "test_vector.rda") | |
44 | >>> converted = rdata.conversion.convert(parsed) | |
45 | >>> converted | |
46 | {'test_vector': array([1., 2., 3.])} | |
47 | ||
48 | This consists of two steps: | |
49 | ||
50 | #. First, the file is parsed using the function | |
51 | `parse_file`. This provides a literal description of the | |
52 | file contents as a hierarchy of Python objects representing the basic R | |
53 | objects. This step is unambiguous and always the same. | |
54 | #. Then, each object must be converted to an appropriate Python object. In this | |
55 | step there are several choices on which Python type is the most appropriate | |
56 | as the conversion for a given R object. Thus, we provide a default | |
57 | `convert` routine, which tries to select Python | |
58 | objects that preserve most information of the original R object. For custom | |
59 | R classes, it is also possible to specify conversion routines to Python | |
60 | objects. | |
61 | ||
62 | Convert custom R classes | |
63 | ------------------------ | |
64 | ||
65 | The basic `convert` routine only constructs a | |
66 | `SimpleConverter` object and calls its | |
67 | `convert` method. All arguments of | |
68 | `convert` are directly passed to the | |
69 | `SimpleConverter` initialization method. | |
70 | ||
71 | It is possible, although not trivial, to make a custom | |
72 | `Converter` object to change the way in which the | |
73 | basic R objects are transformed to Python objects. However, a more common | |
74 | situation is that one does not want to change how basic R objects are | |
75 | converted, but instead wants to provide conversions for specific R classes. | |
76 | This can be done by passing a dictionary to the | |
77 | `SimpleConverter` initialization method, containing | |
78 | as keys the names of R classes and as values, callables that convert a | |
79 | R object of that class to a Python object. By default, the dictionary used | |
80 | is `DEFAULT_CLASS_MAP`, which can convert | |
81 | commonly used R classes such as `data.frame` and `factor`. | |
82 | ||
83 | As an example, here is how we would implement a conversion routine for the | |
84 | factor class to `bytes` objects, instead of the default conversion to | |
85 | Pandas `Categorical` objects: | |
86 | ||
87 | >>> import rdata | |
88 | ||
89 | >>> def factor_constructor(obj, attrs): | |
90 | ... values = [bytes(attrs['levels'][i - 1], 'utf8') | |
91 | ... if i >= 0 else None for i in obj] | |
92 | ... | |
93 | ... return values | |
94 | ||
95 | >>> new_dict = { | |
96 | ... **rdata.conversion.DEFAULT_CLASS_MAP, | |
97 | ... "factor": factor_constructor | |
98 | ... } | |
99 | ||
100 | >>> parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH | |
101 | ... / "test_dataframe.rda") | |
102 | >>> converted = rdata.conversion.convert(parsed, new_dict) | |
103 | >>> converted | |
104 | {'test_dataframe': class value | |
105 | 0 b'a' 1 | |
106 | 1 b'b' 2 | |
107 | 2 b'b' 3} | |
108 | ||
109 | ||
110 | .. |build-status| image:: https://github.com/vnmabus/rdata/actions/workflows/main.yml/badge.svg?branch=master | |
111 | :alt: build status | |
112 | :scale: 100% | |
113 | :target: https://github.com/vnmabus/rdata/actions/workflows/main.yml | |
114 | ||
115 | .. |docs| image:: https://readthedocs.org/projects/rdata/badge/?version=latest | |
116 | :alt: Documentation Status | |
117 | :scale: 100% | |
118 | :target: https://rdata.readthedocs.io/en/latest/?badge=latest | |
119 | ||
120 | .. |coverage| image:: http://codecov.io/github/vnmabus/rdata/coverage.svg?branch=develop | |
121 | :alt: Coverage Status | |
122 | :scale: 100% | |
123 | :target: https://codecov.io/gh/vnmabus/rdata/branch/develop | |
124 | ||
125 | .. |landscape| image:: https://landscape.io/github/vnmabus/rdata/develop/landscape.svg?style=flat | |
126 | :target: https://landscape.io/github/vnmabus/rdata/develop | |
127 | :alt: Code Health | |
128 | ||
129 | .. |pypi| image:: https://badge.fury.io/py/rdata.svg | |
130 | :alt: Pypi version | |
131 | :scale: 100% | |
132 | :target: https://pypi.python.org/pypi/rdata/⏎ |
0 | collect_ignore = ['setup.py'] |
0 | # Minimal makefile for Sphinx documentation | |
1 | # | |
2 | ||
3 | # You can set these variables from the command line. | |
4 | SPHINXOPTS = | |
5 | SPHINXBUILD = sphinx-build | |
6 | SPHINXPROJ = rdata | |
7 | SOURCEDIR = . | |
8 | BUILDDIR = _build | |
9 | ||
10 | # Put it first so that "make" without argument is like "make help". | |
11 | help: | |
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) | |
13 | ||
14 | .PHONY: help Makefile | |
15 | ||
16 | # Catch-all target: route all unknown targets to Sphinx using the new | |
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). | |
18 | %: Makefile | |
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)⏎ |
0 | {{ objname | escape | underline}} | |
1 | ||
2 | .. currentmodule:: {{ module }} | |
3 | ||
4 | .. auto{{ objtype }}:: {{ objname }}⏎ |
0 | {{ objname | escape | underline}} | |
1 | ||
2 | .. currentmodule:: {{ module }} | |
3 | ||
4 | .. autoclass:: {{ objname }} | |
5 | ||
6 | {% block methods %} | |
7 | {% if methods %} | |
8 | .. rubric:: Methods | |
9 | ||
10 | .. autosummary:: | |
11 | {% for item in methods %} | |
12 | ~{{ name }}.{{ item }} | |
13 | {%- endfor %} | |
14 | {% endif %} | |
15 | ||
16 | .. automethod:: __init__ | |
17 | {% endblock %} | |
18 | ||
19 | {% block attributes %} | |
20 | {% if attributes %} | |
21 | .. rubric:: Attributes | |
22 | ||
23 | .. autosummary:: | |
24 | {% for item in attributes %} | |
25 | ~{{ name }}.{{ item }} | |
26 | {%- endfor %} | |
27 | {% endif %} | |
28 | {% endblock %}⏎ |
0 | {{ objname | escape | underline}} | |
1 | ||
2 | .. automodule:: {{ fullname }} | |
3 | ||
4 | {% block attributes %} | |
5 | {% if attributes %} | |
6 | .. rubric:: {{ _('Module Attributes') }} | |
7 | ||
8 | .. autosummary:: | |
9 | :toctree: | |
10 | {% for item in attributes %} | |
11 | {{ item }} | |
12 | {%- endfor %} | |
13 | {% endif %} | |
14 | {% endblock %} | |
15 | ||
16 | {% block functions %} | |
17 | {% if functions %} | |
18 | .. rubric:: {{ _('Functions') }} | |
19 | ||
20 | .. autosummary:: | |
21 | :toctree: | |
22 | {% for item in functions %} | |
23 | {{ item }} | |
24 | {%- endfor %} | |
25 | {% endif %} | |
26 | {% endblock %} | |
27 | ||
28 | {% block classes %} | |
29 | {% if classes %} | |
30 | .. rubric:: {{ _('Classes') }} | |
31 | ||
32 | .. autosummary:: | |
33 | :toctree: | |
34 | {% for item in classes %} | |
35 | {{ item }} | |
36 | {%- endfor %} | |
37 | {% endif %} | |
38 | {% endblock %} | |
39 | ||
40 | {% block exceptions %} | |
41 | {% if exceptions %} | |
42 | .. rubric:: {{ _('Exceptions') }} | |
43 | ||
44 | .. autosummary:: | |
45 | :toctree: | |
46 | {% for item in exceptions %} | |
47 | {{ item }} | |
48 | {%- endfor %} | |
49 | {% endif %} | |
50 | {% endblock %} | |
51 | ||
52 | {% block modules %} | |
53 | {% if modules %} | |
54 | .. rubric:: Modules | |
55 | ||
56 | .. autosummary:: | |
57 | :toctree: | |
58 | :recursive: | |
59 | {% for item in modules %} | |
60 | {{ item }} | |
61 | {%- endfor %} | |
62 | {% endif %} | |
63 | {% endblock %}⏎ |
0 | API List | |
1 | ======== | |
2 | ||
3 | List of functions and structures | |
4 | -------------------------------- | |
5 | A complete list of all functions and structures provided by rdata. | |
6 | ||
7 | Parse :code:`.rda` format | |
8 | ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
9 | Functions for parsing data in the :code:`.rda` format. These functions return a structure representing | |
10 | the contents of the file, without transforming it to more appropriate Python objects. Thus, if a different | |
11 | way of converting R objects to Python objects is needed, it can be done from this structure. | |
12 | ||
13 | .. autosummary:: | |
14 | :toctree: modules | |
15 | ||
16 | rdata.parser.parse_file | |
17 | rdata.parser.parse_data | |
18 | ||
19 | Conversion of the R objects | |
20 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
21 | These objects and functions convert the parsed R objects to appropriate Python objects. The Python object | |
22 | corresponding to a R object is chosen to preserve most original properties, but it could change in the | |
23 | future, if a more fitting Python object is found. | |
24 | ||
25 | .. autosummary:: | |
26 | :toctree: modules | |
27 | ||
28 | rdata.conversion.Converter | |
29 | rdata.conversion.SimpleConverter | |
30 | rdata.conversion.convert | |
31 |
0 | #!/usr/bin/env python3 | |
1 | # -*- coding: utf-8 -*- | |
2 | # | |
3 | # rdata documentation build configuration file, created by | |
4 | # sphinx-quickstart on Tue Aug 7 12:49:32 2018. | |
5 | # | |
6 | # This file is execfile()d with the current directory set to its | |
7 | # containing dir. | |
8 | # | |
9 | # Note that not all possible configuration values are present in this | |
10 | # autogenerated file. | |
11 | # | |
12 | # All configuration values have a default; values that are commented out | |
13 | # serve to show the default. | |
14 | ||
15 | # If extensions (or modules to document with autodoc) are in another directory, | |
16 | # add these directories to sys.path here. If the directory is relative to the | |
17 | # documentation root, use os.path.abspath to make it absolute, like shown here. | |
18 | # | |
19 | # import os | |
20 | # import sys | |
21 | # sys.path.insert(0, '/home/carlos/git/rdata/rdata') | |
22 | ||
23 | import sys | |
24 | import pkg_resources | |
25 | try: | |
26 | release = pkg_resources.get_distribution('rdata').version | |
27 | except pkg_resources.DistributionNotFound: | |
28 | print('To build the documentation, The distribution information of rdata\n' | |
29 | 'Has to be available. Either install the package into your\n' | |
30 | 'development environment or run "setup.py develop" to setup the\n' | |
31 | 'metadata. A virtualenv is recommended!\n') | |
32 | sys.exit(1) | |
33 | del pkg_resources | |
34 | ||
35 | version = '.'.join(release.split('.')[:2]) | |
36 | ||
37 | # -- General configuration ------------------------------------------------ | |
38 | ||
39 | # If your documentation needs a minimal Sphinx version, state it here. | |
40 | # | |
41 | # needs_sphinx = '1.0' | |
42 | ||
43 | # Add any Sphinx extension module names here, as strings. They can be | |
44 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom | |
45 | # ones. | |
46 | extensions = ['sphinx.ext.autodoc', | |
47 | 'sphinx.ext.autosummary', | |
48 | 'sphinx.ext.todo', | |
49 | 'sphinx.ext.viewcode', | |
50 | 'sphinx.ext.napoleon', | |
51 | 'sphinx.ext.mathjax', | |
52 | 'sphinx.ext.intersphinx'] | |
53 | ||
54 | # Add any paths that contain templates here, relative to this directory. | |
55 | templates_path = ['_templates'] | |
56 | ||
57 | # The suffix(es) of source filenames. | |
58 | # You can specify multiple suffix as a list of string: | |
59 | # | |
60 | # source_suffix = ['.rst', '.md'] | |
61 | source_suffix = '.rst' | |
62 | ||
63 | # The master toctree document. | |
64 | master_doc = 'index' | |
65 | ||
66 | # General information about the project. | |
67 | project = 'rdata' | |
68 | copyright = '2018, Carlos Ramos Carreño' | |
69 | author = 'Carlos Ramos Carreño' | |
70 | ||
71 | # The version info for the project you're documenting, acts as replacement for | |
72 | # |version| and |release|, also used in various other places throughout the | |
73 | # built documents. | |
74 | # | |
75 | # The short X.Y version. | |
76 | # version = '' | |
77 | # The full version, including alpha/beta/rc tags. | |
78 | # release = '' | |
79 | ||
80 | # The language for content autogenerated by Sphinx. Refer to documentation | |
81 | # for a list of supported languages. | |
82 | # | |
83 | # This is also used if you do content translation via gettext catalogs. | |
84 | # Usually you set "language" from the command line for these cases. | |
85 | language = 'en' | |
86 | ||
87 | # List of patterns, relative to source directory, that match files and | |
88 | # directories to ignore when looking for source files. | |
89 | # These patterns also affect html_static_path and html_extra_path | |
90 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] | |
91 | ||
92 | # The name of the Pygments (syntax highlighting) style to use. | |
93 | pygments_style = 'sphinx' | |
94 | ||
95 | # If true, `todo` and `todoList` produce output, else they produce nothing. | |
96 | todo_include_todos = True | |
97 | ||
98 | add_module_names = False | |
99 | ||
100 | autosummary_generate = True | |
101 | ||
102 | # -- Options for HTML output ---------------------------------------------- | |
103 | ||
104 | # The theme to use for HTML and HTML Help pages. See the documentation for | |
105 | # a list of builtin themes. | |
106 | # | |
107 | html_theme = 'sphinx_rtd_theme' | |
108 | ||
109 | # Theme options are theme-specific and customize the look and feel of a theme | |
110 | # further. For a list of options available for each theme, see the | |
111 | # documentation. | |
112 | # | |
113 | # html_theme_options = {} | |
114 | ||
115 | # Add any paths that contain custom static files (such as style sheets) here, | |
116 | # relative to this directory. They are copied after the builtin static files, | |
117 | # so a file named "default.css" will overwrite the builtin "default.css". | |
118 | html_static_path = ['_static'] | |
119 | ||
120 | # Custom sidebar templates, must be a dictionary that maps document names | |
121 | # to template names. | |
122 | # | |
123 | # This is required for the alabaster theme | |
124 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars | |
125 | html_sidebars = { | |
126 | '**': [ | |
127 | 'about.html', | |
128 | 'navigation.html', | |
129 | 'relations.html', # needs 'show_related': True theme option to display | |
130 | 'searchbox.html', | |
131 | 'donate.html', | |
132 | ] | |
133 | } | |
134 | ||
135 | # -- Options for HTMLHelp output ------------------------------------------ | |
136 | ||
137 | # Output file base name for HTML help builder. | |
138 | htmlhelp_basename = 'rdatadoc' | |
139 | ||
140 | # -- Options for LaTeX output --------------------------------------------- | |
141 | ||
142 | latex_elements = { | |
143 | # The paper size ('letterpaper' or 'a4paper'). | |
144 | # | |
145 | # 'papersize': 'letterpaper', | |
146 | ||
147 | # The font size ('10pt', '11pt' or '12pt'). | |
148 | # | |
149 | # 'pointsize': '10pt', | |
150 | ||
151 | # Additional stuff for the LaTeX preamble. | |
152 | # | |
153 | # 'preamble': '', | |
154 | ||
155 | # Latex figure (float) alignment | |
156 | # | |
157 | # 'figure_align': 'htbp', | |
158 | } | |
159 | ||
160 | # Grouping the document tree into LaTeX files. List of tuples | |
161 | # (source start file, target name, title, | |
162 | # author, documentclass [howto, manual, or own class]). | |
163 | latex_documents = [ | |
164 | (master_doc, 'rdata.tex', 'rdata Documentation', | |
165 | 'Carlos Ramos Carreño', 'manual'), | |
166 | ] | |
167 | ||
168 | # -- Options for manual page output --------------------------------------- | |
169 | ||
170 | # One entry per manual page. List of tuples | |
171 | # (source start file, name, description, authors, manual section). | |
172 | man_pages = [ | |
173 | (master_doc, 'rdata', 'rdata Documentation', | |
174 | [author], 1) | |
175 | ] | |
176 | ||
177 | # -- Options for Texinfo output ------------------------------------------- | |
178 | ||
179 | # Grouping the document tree into Texinfo files. List of tuples | |
180 | # (source start file, target name, title, author, | |
181 | # dir menu entry, description, category) | |
182 | texinfo_documents = [ | |
183 | (master_doc, 'rdata', 'rdata Documentation', | |
184 | author, 'rdata', 'One line description of project.', | |
185 | 'Miscellaneous'), | |
186 | ] | |
187 | ||
188 | # -- Options for Epub output ---------------------------------------------- | |
189 | ||
190 | # Bibliographic Dublin Core info. | |
191 | epub_title = project | |
192 | epub_author = author | |
193 | epub_publisher = author | |
194 | epub_copyright = copyright | |
195 | ||
196 | # The unique identifier of the text. This can be a ISBN number | |
197 | # or the project homepage. | |
198 | # | |
199 | # epub_identifier = '' | |
200 | ||
201 | # A unique identification for the text. | |
202 | # | |
203 | # epub_uid = '' | |
204 | ||
205 | # A list of files that should not be packed into the epub file. | |
206 | epub_exclude_files = ['search.html'] | |
207 | ||
# External Sphinx inventories used to resolve cross-references.
# Both inventories are fetched over HTTPS so that they cannot be tampered
# with in transit during the documentation build.
intersphinx_mapping = {
    'python': ('https://docs.python.org/3', None),
    'pandas': ('https://pandas.pydata.org/pandas-docs/dev', None),
}
0 | rdata version |version| | |
1 | ======================= | |
2 | ||
3 | |build-status| |docs| |coverage| |landscape| |pypi| | |
4 | ||
5 | Open :code:`.rda` R data files containing datasets and convert them to the appropriate Python objects. | |
6 | ||
7 | .. toctree:: | |
8 | :maxdepth: 4 | |
9 | :caption: Contents: | |
10 | ||
11 | installation | |
12 | simpleusage | |
13 | apilist | |
14 | internalapi | |
15 | ||
16 | rdata is developed `on Github <http://github.com/vnmabus/rdata>`_. Please | |
17 | report `issues <https://github.com/vnmabus/rdata/issues>`_ there as well. | |
18 | ||
19 | Indices and tables | |
20 | ================== | |
21 | ||
22 | * :ref:`genindex` | |
23 | * :ref:`modindex` | |
24 | * :ref:`search` | |
25 | ||
26 | .. |build-status| image:: https://github.com/vnmabus/rdata/actions/workflows/main.yml/badge.svg?branch=master | |
27 | :alt: build status | |
28 | :scale: 100% | |
29 | :target: https://github.com/vnmabus/rdata/actions/workflows/main.yml | |
30 | ||
31 | .. |docs| image:: https://readthedocs.org/projects/rdata/badge/?version=latest | |
32 | :alt: Documentation Status | |
33 | :scale: 100% | |
34 | :target: https://rdata.readthedocs.io/en/latest/?badge=latest | |
35 | ||
36 | .. |coverage| image:: http://codecov.io/github/vnmabus/rdata/coverage.svg?branch=develop | |
37 | :alt: Coverage Status | |
38 | :scale: 100% | |
39 | :target: https://codecov.io/gh/vnmabus/rdata/branch/develop | |
40 | ||
41 | .. |landscape| image:: https://landscape.io/github/vnmabus/rdata/develop/landscape.svg?style=flat | |
42 | :target: https://landscape.io/github/vnmabus/rdata/develop | |
43 | :alt: Code Health | |
44 | ||
45 | .. |pypi| image:: https://badge.fury.io/py/rdata.svg | |
46 | :alt: Pypi version | |
47 | :scale: 100% | |
48 | :target: https://pypi.python.org/pypi/rdata/ |
0 | Installation | |
1 | ============ | |
2 | ||
3 | rdata is on PyPI and can be installed using :code:`pip`: | |
4 | ||
5 | .. code:: | |
6 | ||
7 | pip install rdata | |
8 | ||
9 | It is also available for :code:`conda` using the :code:`conda-forge` channel: | |
10 | ||
11 | .. code:: | |
12 | ||
13 | conda install -c conda-forge rdata |
0 | Internal documentation | |
1 | ====================== | |
2 | ||
3 | List of modules | |
4 | --------------- | |
5 | .. autosummary:: | |
6 | :toctree: modules | |
7 | :recursive: | |
8 | ||
9 | rdata.parser._parser | |
10 | rdata.conversion._conversion⏎ |
0 | @ECHO OFF | |
1 | ||
2 | pushd %~dp0 | |
3 | ||
4 | REM Command file for Sphinx documentation | |
5 | ||
6 | if "%SPHINXBUILD%" == "" ( | |
7 | set SPHINXBUILD=sphinx-build | |
8 | ) | |
9 | set SOURCEDIR=. | |
10 | set BUILDDIR=_build | |
11 | set SPHINXPROJ=rdata | |
12 | ||
13 | if "%1" == "" goto help | |
14 | ||
15 | %SPHINXBUILD% >NUL 2>NUL | |
16 | if errorlevel 9009 ( | |
17 | echo. | |
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx | |
19 | echo.installed, then set the SPHINXBUILD environment variable to point | |
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you | |
21 | echo.may add the Sphinx directory to PATH. | |
22 | echo. | |
23 | echo.If you don't have Sphinx installed, grab it from | |
24 | echo.http://sphinx-doc.org/ | |
25 | exit /b 1 | |
26 | ) | |
27 | ||
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% | |
29 | goto end | |
30 | ||
31 | :help | |
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% | |
33 | ||
34 | :end | |
35 | popd |
0 | Simple usage | |
1 | ============ | |
2 | ||
3 | Read a R dataset | |
4 | ---------------- | |
5 | ||
6 | The common way of reading an R dataset is the following one: | |
7 | ||
8 | >>> import rdata | |
9 | ||
10 | >>> parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH / "test_vector.rda") | |
11 | >>> converted = rdata.conversion.convert(parsed) | |
12 | >>> converted | |
13 | {'test_vector': array([1., 2., 3.])} | |
14 | ||
15 | This consists of two steps: | |
16 | ||
17 | #. First, the file is parsed using the function | |
18 | :func:`~rdata.parser.parse_file`. This provides a literal description of the | |
19 | file contents as a hierarchy of Python objects representing the basic R | |
20 | objects. This step is unambiguous and always the same. | |
21 | #. Then, each object must be converted to an appropriate Python object. In this | |
22 | step there are several choices on which Python type is the most appropriate | |
23 | as the conversion for a given R object. Thus, we provide a default | |
24 | :func:`~rdata.conversion.convert` routine, which tries to select Python | |
25 | objects that preserve most information of the original R object. For custom | |
26 | R classes, it is also possible to specify conversion routines to Python | |
27 | objects. | |
28 | ||
29 | Convert custom R classes | |
30 | ------------------------ | |
31 | ||
32 | The basic :func:`~rdata.conversion.convert` routine only constructs a | |
33 | :class:`~rdata.conversion.SimpleConverter` object and calls its | |
34 | :func:`~rdata.conversion.SimpleConverter.convert` method. All arguments of | |
35 | :func:`~rdata.conversion.convert` are directly passed to the | |
36 | :class:`~rdata.conversion.SimpleConverter` initialization method. | |
37 | ||
38 | It is possible, although not trivial, to make a custom | |
39 | :class:`~rdata.conversion.Converter` object to change the way in which the | |
40 | basic R objects are transformed to Python objects. However, a more common | |
41 | situation is that one does not want to change how basic R objects are | |
42 | converted, but instead wants to provide conversions for specific R classes. | |
43 | This can be done by passing a dictionary to the | |
44 | :class:`~rdata.conversion.SimpleConverter` initialization method, containing | |
45 | as keys the names of R classes and as values, callables that convert a | |
46 | R object of that class to a Python object. By default, the dictionary used | |
47 | is :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP`, which can convert | |
48 | commonly used R classes such as `data.frame` and `factor`. | |
49 | ||
50 | As an example, here is how we would implement a conversion routine for the | |
51 | factor class to :class:`bytes` objects, instead of the default conversion to | |
52 | Pandas :class:`~pandas.Categorical` objects: | |
53 | ||
54 | >>> import rdata | |
55 | ||
56 | >>> def factor_constructor(obj, attrs): | |
57 | ... values = [bytes(attrs['levels'][i - 1], 'utf8') | |
58 | ... if i >= 0 else None for i in obj] | |
59 | ... | |
60 | ... return values | |
61 | ||
62 | >>> new_dict = { | |
63 | ... **rdata.conversion.DEFAULT_CLASS_MAP, | |
64 | ... "factor": factor_constructor | |
65 | ... } | |
66 | ||
67 | >>> parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH | |
68 | ... / "test_dataframe.rda") | |
69 | >>> converted = rdata.conversion.convert(parsed, new_dict) | |
70 | >>> converted | |
71 | {'test_dataframe': class value | |
72 | 0 b'a' 1 | |
73 | 1 b'b' 2 | |
74 | 2 b'b' 3} |
0 | [build-system] | |
1 | # Minimum requirements for the build system to execute. | |
2 | requires = ["setuptools"]⏎ |
0 | import os as _os | |
1 | import pathlib as _pathlib | |
2 | ||
3 | from . import conversion, parser | |
4 | ||
5 | ||
def _get_test_data_path() -> _pathlib.Path:
    """Return the ``tests/data`` directory located next to this module."""
    package_dir = _pathlib.Path(_os.path.dirname(__file__))
    return package_dir.joinpath("tests", "data")


TESTDATA_PATH = _get_test_data_path()
"""
Path of the test data.

"""
0 | from ._conversion import (RExpression, RLanguage, | |
1 | convert_list, convert_attrs, convert_vector, | |
2 | convert_char, convert_symbol, convert_array, | |
3 | Converter, SimpleConverter, | |
4 | dataframe_constructor, | |
5 | factor_constructor, | |
6 | ts_constructor, | |
7 | DEFAULT_CLASS_MAP, convert) |
0 | import abc | |
1 | import warnings | |
2 | from fractions import Fraction | |
3 | from types import MappingProxyType, SimpleNamespace | |
4 | from typing import ( | |
5 | Any, | |
6 | Callable, | |
7 | ChainMap, | |
8 | Hashable, | |
9 | List, | |
10 | Mapping, | |
11 | MutableMapping, | |
12 | NamedTuple, | |
13 | Optional, | |
14 | Union, | |
15 | cast, | |
16 | ) | |
17 | ||
18 | import numpy as np | |
19 | import pandas | |
20 | import xarray | |
21 | ||
22 | from .. import parser | |
23 | from ..parser import RObject | |
24 | ||
25 | ||
class RLanguage(NamedTuple):
    """
    Container for an R language construct.

    Attributes
    ----------
    elements: list
        Elements that make up the language construct.

    """

    elements: List[Any]
31 | ||
32 | ||
class RExpression(NamedTuple):
    """
    Container for an R expression.

    Attributes
    ----------
    elements: list of RLanguage
        Language constructs that make up the expression.

    """

    elements: List[RLanguage]
38 | ||
39 | ||
def convert_list(
    r_list: parser.RObject,
    conversion_function: Callable[
        [Union[parser.RData, parser.RObject]], Any
    ] = lambda x: x,
) -> Union[Mapping[Union[str, bytes], Any], List[Any]]:
    """
    Unroll an R pairlist node into a Python dictionary or list.

    The tag of the node, the remainder of the list (``value[1]``) and the
    element stored in the node (``value[0]``) are converted, in that order,
    with ``conversion_function``.

    Parameters
    ----------
    r_list: RObject
        R object of type LIST, LANG or NILVALUE.
    conversion_function: Callable
        Function applied to the tag, to the element of the node and to the
        remainder of the list. The identity function is used by default.

    Returns
    -------
    converted: dict or list
        An empty dictionary for a NILVALUE object. When the node has a
        (converted) tag, a dictionary with that tag mapped to the converted
        element, followed by the entries of the converted remainder.
        Otherwise, a list with the converted element followed by the items
        of the converted remainder.

    Raises
    ------
    TypeError
        If the object is not of type LIST, LANG or NILVALUE.

    See Also
    --------
    convert_vector

    """
    node_type = r_list.info.type
    if node_type is parser.RObjectType.NILVALUE:
        return {}
    if node_type not in (parser.RObjectType.LIST, parser.RObjectType.LANG):
        raise TypeError("Must receive a LIST, LANG or NILVALUE object")

    # Conversion order matters if the conversion function keeps state:
    # tag first, then the remainder of the list, then the node element.
    key = None if r_list.tag is None else conversion_function(r_list.tag)
    rest = conversion_function(r_list.value[1])
    head = conversion_function(r_list.value[0])

    if key is None:
        # Untagged node: build a plain list.
        return [head, *([] if rest is None else rest)]

    # Tagged node: build a dictionary keyed by the tags.
    return {key: head, **({} if rest is None else rest)}
91 | ||
92 | ||
def convert_env(
    r_env: parser.RObject,
    conversion_function: Callable[
        [Union[parser.RData, parser.RObject]],
        Any,
    ] = lambda x: x,
) -> ChainMap[Union[str, bytes], Any]:
    """
    Convert a R environment into a ``ChainMap``.

    The first mapping of the chain collects every non-``None`` entry of the
    converted hash table; the converted enclosure follows it.

    Parameters
    ----------
    r_env: RObject
        R environment (``ENV``) object.
    conversion_function: Callable
        Conversion function applied to the frame, the enclosure and the hash
        table. By default is the identity function.

    Returns
    -------
    environment: ChainMap
        Bindings of the environment chained with its enclosure.

    Raises
    ------
    TypeError
        If the object is not an ``ENV``.

    """
    if r_env.info.type is not parser.RObjectType.ENV:
        raise TypeError("Must receive a ENV object")

    env = r_env.value

    # The frame is converted although its result is not used: the call is
    # kept because the conversion function may have side effects.
    conversion_function(env.frame)
    enclosure = conversion_function(env.enclosure)
    buckets = conversion_function(env.hash_table)

    bindings = {}
    for bucket in buckets:
        if bucket is None:
            continue
        bindings.update(bucket)

    return ChainMap(bindings, enclosure)
113 | ||
114 | ||
def convert_attrs(
    r_obj: parser.RObject,
    conversion_function: Callable[
        [Union[parser.RData, parser.RObject]],
        Any,
    ] = lambda x: x,
) -> Mapping[Union[str, bytes], Any]:
    """
    Return the attributes of an object as a Python mapping.

    Parameters
    ----------
    r_obj: RObject
        R object.
    conversion_function: Callable
        Conversion function to apply to the attribute list. By default is
        the identity function.

    Returns
    -------
    dictionary: dict
        Result of converting the attribute list, or an empty dictionary
        when the object has no attributes.

    See Also
    --------
    convert_list

    """
    if not r_obj.attributes:
        return {}

    return cast(
        Mapping[Union[str, bytes], Any],
        conversion_function(r_obj.attributes),
    )
151 | ||
152 | ||
def convert_vector(
    r_vec: parser.RObject,
    conversion_function: Callable[
        [Union[parser.RData, parser.RObject]],
        Any,
    ] = lambda x: x,
    attrs: Optional[Mapping[Union[str, bytes], Any]] = None,
) -> Union[List[Any], Mapping[Union[str, bytes], Any]]:
    """
    Convert a R vector to a Python list or dictionary.

    When ``attrs`` holds a non-empty ``names`` entry the converted elements
    are paired with those names in a dictionary. Otherwise they are
    returned as a list.

    Parameters
    ----------
    r_vec: RObject
        R vector (``VEC`` or ``EXPR``).
    conversion_function: Callable
        Conversion function to apply to the elements of the vector. By
        default is the identity function.
    attrs: Mapping
        Already converted attributes of the vector. ``None`` is treated as
        no attributes.

    Returns
    -------
    vector: dict or list
        Dictionary keyed by ``names`` when available, plain list otherwise.

    Raises
    ------
    TypeError
        If the object is not a ``VEC`` or ``EXPR``.

    See Also
    --------
    convert_list

    """
    attributes = {} if attrs is None else attrs

    if r_vec.info.type not in [parser.RObjectType.VEC,
                               parser.RObjectType.EXPR]:
        raise TypeError("Must receive a VEC or EXPR object")

    items: List[Any] = [conversion_function(o) for o in r_vec.value]

    names = attributes.get('names')
    if not names:
        return items

    # Named vectors become dictionaries.
    return dict(zip(names, items))
202 | ||
203 | ||
def safe_decode(byte_str: bytes, encoding: str) -> Union[str, bytes]:
    """
    Decode a (possibly malformed) string.

    Parameters
    ----------
    byte_str: bytes
        Raw bytes to decode.
    encoding: str
        Name of the codec to use.

    Returns
    -------
    string: str or bytes
        The decoded text. If decoding fails with ``UnicodeDecodeError`` a
        warning is emitted and the original bytes are returned unchanged.

    """
    try:
        decoded = byte_str.decode(encoding)
    except UnicodeDecodeError as e:
        warnings.warn(f"Exception while decoding {byte_str!r}: {e}")
        return byte_str

    return decoded
215 | ||
216 | ||
def convert_char(
    r_char: parser.RObject,
    default_encoding: Optional[str] = None,
    force_default_encoding: bool = False,
) -> Union[str, bytes, None]:
    """
    Decode a R character array to a Python string or bytes.

    The encoding is read from the flag bits stored in ``info.gp``. The
    flags are checked in the order UTF8, LATIN1, ASCII, BYTES; the first
    one set decides how the value is decoded (``BYTES`` returns the raw
    bytes). If no flag is set, or ``force_default_encoding`` is true,
    ``default_encoding`` is used, falling back to ASCII with a warning.

    Parameters
    ----------
    r_char: RObject
        R character array (``CHAR``).
    default_encoding: str
        Encoding used when the object does not mark its own encoding.
    force_default_encoding: bool
        Ignore the encoding flags of the object.

    Returns
    -------
    string: str, bytes or None
        Decoded string, raw bytes, or ``None`` if the value is ``None``.

    Raises
    ------
    TypeError
        If the object is not a ``CHAR``.

    See Also
    --------
    convert_symbol

    """
    if r_char.info.type is not parser.RObjectType.CHAR:
        raise TypeError("Must receive a CHAR object")

    raw = r_char.value
    if raw is None:
        return None

    assert isinstance(raw, bytes)

    if not force_default_encoding:
        flags = r_char.info.gp
        flagged_codecs = (
            (parser.CharFlags.UTF8, "utf_8"),
            (parser.CharFlags.LATIN1, "latin_1"),
            (parser.CharFlags.ASCII, "ascii"),
        )
        for flag, codec in flagged_codecs:
            if flags & flag:
                return safe_decode(raw, codec)

        if flags & parser.CharFlags.BYTES:
            return raw

    if default_encoding:
        return safe_decode(raw, default_encoding)

    # Assume ASCII if no encoding is marked
    warnings.warn("Unknown encoding. Assumed ASCII.")
    return safe_decode(raw, "ascii")
268 | ||
269 | ||
def convert_symbol(
    r_symbol: parser.RObject,
    conversion_function: Callable[
        [Union[parser.RData, parser.RObject]],
        Any,
    ] = lambda x: x,
) -> Union[str, bytes]:
    """
    Decode a R symbol to a Python string or bytes.

    Parameters
    ----------
    r_symbol: RObject
        R symbol (``SYM``).
    conversion_function: Callable
        Conversion function to apply to the value of the symbol.
        By default is the identity function.

    Returns
    -------
    string: str or bytes
        Converted value of the symbol.

    Raises
    ------
    TypeError
        If the object is not a ``SYM``.

    See Also
    --------
    convert_char

    """
    if r_symbol.info.type is not parser.RObjectType.SYM:
        raise TypeError("Must receive a SYM object")

    name = conversion_function(r_symbol.value)
    assert isinstance(name, (str, bytes))
    return name
302 | ||
303 | ||
def convert_array(
    r_array: RObject,
    conversion_function: Callable[
        [Union[parser.RData, parser.RObject]],
        Any,
    ] = lambda x: x,
    attrs: Optional[Mapping[Union[str, bytes], Any]] = None,
) -> Union[np.ndarray, xarray.DataArray]:
    """
    Convert a R array to a Numpy ndarray or a Xarray DataArray.

    A ``dim`` attribute reshapes the value in column-major (Fortran) order.
    A non-empty ``dimnames`` attribute turns the result into a
    ``DataArray`` with dimensions named ``dim_0``, ``dim_1``, ... and the
    non-``None`` entries of ``dimnames`` as coordinates.

    Parameters
    ----------
    r_array: RObject
        R array (``LGL``, ``INT``, ``REAL`` or ``CPLX``).
    conversion_function: Callable
        Accepted for signature compatibility with the other converters; it
        is not called here.
    attrs: Mapping
        Already converted attributes of the array. ``None`` is treated as
        no attributes.

    Returns
    -------
    array: ndarray or DataArray
        Array.

    Raises
    ------
    TypeError
        If the object is not one of the supported array types.

    See Also
    --------
    convert_vector

    """
    attributes = {} if attrs is None else attrs

    array_types = {
        parser.RObjectType.LGL,
        parser.RObjectType.INT,
        parser.RObjectType.REAL,
        parser.RObjectType.CPLX,
    }
    if r_array.info.type not in array_types:
        raise TypeError("Must receive an array object")

    result = r_array.value

    dims = attributes.get('dim')
    if dims is not None:
        # R matrix order is like FORTRAN
        result = np.reshape(result, dims, order='F')

    dimnames = attributes.get('dimnames')
    if not dimnames:
        return result

    axis_names = ["dim_" + str(i) for i, _ in enumerate(dimnames)]
    coords: Mapping[Hashable, Any] = {
        axis: labels
        for axis, labels in zip(axis_names, dimnames)
        if labels is not None
    }

    return xarray.DataArray(result, dims=axis_names, coords=coords)
361 | ||
362 | ||
def dataframe_constructor(
    obj: Any,
    attrs: Mapping[Union[str, bytes], Any],
) -> pandas.DataFrame:
    """
    Build a Pandas ``DataFrame`` from a converted R ``data.frame``.

    ``obj`` is passed both as the data and as the ``columns`` argument.
    ``attrs`` is part of the constructor signature but is not used.
    """
    frame = pandas.DataFrame(obj, columns=obj)
    return frame
368 | ||
369 | ||
370 | def _factor_constructor_internal( | |
371 | obj: Any, | |
372 | attrs: Mapping[Union[str, bytes], Any], | |
373 | ordered: bool, | |
374 | ) -> pandas.Categorical: | |
375 | values = [attrs['levels'][i - 1] if i >= 0 else None for i in obj] | |
376 | ||
377 | return pandas.Categorical(values, attrs['levels'], ordered=ordered) | |
378 | ||
379 | ||
def factor_constructor(
    obj: Any,
    attrs: Mapping[Union[str, bytes], Any],
) -> pandas.Categorical:
    """
    Build an unordered Pandas ``Categorical`` from a R ``factor``.
    """
    categorical = _factor_constructor_internal(obj, attrs, ordered=False)
    return categorical
385 | ||
386 | ||
def ordered_constructor(
    obj: Any,
    attrs: Mapping[Union[str, bytes], Any],
) -> pandas.Categorical:
    """
    Build an ordered Pandas ``Categorical`` from a R ``ordered`` factor.
    """
    categorical = _factor_constructor_internal(obj, attrs, ordered=True)
    return categorical
392 | ||
393 | ||
def ts_constructor(
    obj: Any,
    attrs: Mapping[Union[str, bytes], Any],
) -> pandas.Series:
    """
    Build a Pandas ``Series`` from a R ``ts`` (time series) object.

    The index is generated from the ``tsp`` attribute, which is unpacked as
    ``(start, end, frequency)``. Exact fractions are used so that the step
    ``1 / frequency`` does not accumulate floating point error. When the
    frequency is 1 the index is converted to integers.
    """
    start, end, frequency = attrs['tsp']
    frequency = int(frequency)

    step = Fraction(1, frequency)
    first = Fraction(int(round(start * frequency)), frequency)
    last = Fraction(int(round(end * frequency)), frequency)

    # ``last + step`` makes the end point inclusive.
    index = np.arange(first, last + step, step)

    if frequency == 1:
        index = index.astype(int)

    return pandas.Series(obj, index=index)
413 | ||
414 | ||
# Type of a class constructor. SimpleConverter calls it as
# ``constructor(value, attrs)`` with the converted value and the converted
# attributes of the R object.
Constructor = Callable[[Any, Mapping], Any]

# Backing dictionary of the default class map, keyed by R class name.
default_class_map_dict: Mapping[Union[str, bytes], Constructor] = {
    "data.frame": dataframe_constructor,
    "factor": factor_constructor,
    "ordered": ordered_constructor,
    "ts": ts_constructor,
}

# Read-only view of the dictionary above; this is the default value of the
# ``constructor_dict`` parameter of SimpleConverter.
DEFAULT_CLASS_MAP = MappingProxyType(default_class_map_dict)
"""
Default mapping of constructor functions.

It has support for converting several commonly used R classes:

- Converts R \"data.frame\" objects into Pandas :class:`~pandas.DataFrame`
  objects.
- Converts R \"factor\" objects into unordered Pandas
  :class:`~pandas.Categorical` objects.
- Converts R \"ordered\" objects into ordered Pandas
  :class:`~pandas.Categorical` objects.
- Converts R \"ts\" objects into Pandas :class:`~pandas.Series` objects.

"""
439 | ||
440 | ||
class Converter(abc.ABC):
    """
    Abstract interface of the classes that turn R objects into Python ones.
    """

    @abc.abstractmethod
    def convert(self, data: Union[parser.RData, parser.RObject]) -> Any:
        """
        Convert a R object to a Python one.

        Subclasses must override this method.
        """
452 | ||
453 | ||
class SimpleConverter(Converter):
    """
    Class converting R objects to Python objects.

    Parameters
    ----------
    constructor_dict:
        Dictionary mapping names of R classes to constructor functions with
        the following prototype:

        .. code-block :: python

            def constructor(obj, attrs):

        This dictionary can be used to support custom R classes. By default,
        the dictionary used is
        :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP`
        which has support for several common classes.
    default_encoding:
        Default encoding used for strings with unknown encoding. If `None`,
        the one stored in the file will be used, or ASCII as a fallback.
    force_default_encoding:
        Use the default encoding even if the strings specify other encoding.
    global_environment:
        Mapping returned (wrapped in a ``ChainMap``) for objects of type
        ``GLOBALENV``. An empty mapping is used when it is ``None``.

    """

    def __init__(
        self,
        constructor_dict: Mapping[
            Union[str, bytes],
            Constructor,
        ] = DEFAULT_CLASS_MAP,
        default_encoding: Optional[str] = None,
        force_default_encoding: bool = False,
        global_environment: Optional[Mapping[Union[str, bytes], Any]] = None,
    ) -> None:

        self.constructor_dict = constructor_dict
        self.default_encoding = default_encoding
        self.force_default_encoding = force_default_encoding

        if global_environment is None:
            global_environment = {}
        self.global_environment = ChainMap(global_environment)
        self.empty_environment: Mapping[Union[str, bytes], Any] = ChainMap({})

        self._reset()

    def _reset(self) -> None:
        """Forget converted references and restore the default encoding."""
        self.references: MutableMapping[int, Any] = {}
        self.default_encoding_used = self.default_encoding

    def convert(self, data: Union[parser.RData, parser.RObject]) -> Any:
        """Convert ``data`` starting from a clean conversion state."""
        self._reset()
        return self._convert_next(data)

    def _convert_next(self, data: Union[parser.RData, parser.RObject]) -> Any:
        """
        Convert a R object to a Python one.

        This is the recursive step of the conversion: the helper functions
        receive this method as their conversion function.
        """

        obj: RObject
        if isinstance(data, parser.RData):
            obj = data.object
            # Without an explicit default, use the encoding stored in the
            # file.
            if self.default_encoding is None:
                self.default_encoding_used = data.extra.encoding
        else:
            obj = data

        attrs = convert_attrs(obj, self._convert_next)

        # Key under which the converted value is stored. A REF object
        # stores its value under the id of the object it refers to.
        reference_id = id(obj)

        # NOTE: previously stored values are only looked up for REF
        # objects; every other object is converted again.
        value: Any
        r_type = obj.info.type

        if r_type == parser.RObjectType.SYM:

            # Return the internal string
            value = convert_symbol(obj, self._convert_next)

        elif r_type == parser.RObjectType.LIST:

            # Expand the list and process the elements
            value = convert_list(obj, self._convert_next)

        elif r_type == parser.RObjectType.ENV:

            # Return a ChainMap of the environments
            value = convert_env(obj, self._convert_next)

        elif r_type == parser.RObjectType.LANG:

            # Expand the list and process the elements, returning a
            # special object
            language_items = convert_list(obj, self._convert_next)
            assert isinstance(language_items, list)

            value = RLanguage(language_items)

        elif r_type == parser.RObjectType.CHAR:

            # Return the internal string
            value = convert_char(
                obj,
                default_encoding=self.default_encoding_used,
                force_default_encoding=self.force_default_encoding,
            )

        elif r_type in {
            parser.RObjectType.LGL,
            parser.RObjectType.INT,
            parser.RObjectType.REAL,
            parser.RObjectType.CPLX,
        }:

            # Return the internal array
            value = convert_array(obj, self._convert_next, attrs=attrs)

        elif r_type == parser.RObjectType.STR:

            # Convert the internal strings
            value = [self._convert_next(o) for o in obj.value]

        elif r_type == parser.RObjectType.VEC:

            # Convert the internal objects
            value = convert_vector(obj, self._convert_next, attrs=attrs)

        elif r_type == parser.RObjectType.EXPR:

            # Convert the internal objects returning a special object
            expression_items = convert_vector(
                obj,
                self._convert_next,
                attrs=attrs,
            )
            assert isinstance(expression_items, list)

            value = RExpression(expression_items)

        elif r_type == parser.RObjectType.S4:
            value = SimpleNamespace(**attrs)

        elif r_type == parser.RObjectType.EMPTYENV:
            value = self.empty_environment

        elif r_type == parser.RObjectType.GLOBALENV:
            value = self.global_environment

        elif r_type == parser.RObjectType.REF:

            # Return the referenced value, converting it first if it has
            # not been stored yet
            value = self.references.get(id(obj.referenced_object))
            if value is None:
                reference_id = id(obj.referenced_object)
                assert obj.referenced_object is not None
                value = self._convert_next(obj.referenced_object)

        elif r_type == parser.RObjectType.NILVALUE:

            value = None

        else:
            raise NotImplementedError(f"Type {obj.info.type} not implemented")

        if obj.info.object:
            # Try the classes in order and stop at the first one whose
            # constructor produces a value.
            classname = attrs["class"]
            for position, r_class in enumerate(classname):

                constructor = self.constructor_dict.get(r_class, None)
                new_value = (
                    constructor(value, attrs) if constructor
                    else NotImplemented
                )

                if new_value is not NotImplemented:
                    value = new_value
                    break

                missing_msg = (f"Missing constructor for R class "
                               f"\"{r_class}\". ")

                if len(classname) > (position + 1):
                    solution_msg = (f"The constructor for class "
                                    f"\"{classname[position+1]}\" will be "
                                    f"used instead."
                                    )
                else:
                    solution_msg = ("The underlying R object is "
                                    "returned instead.")

                warnings.warn(missing_msg + solution_msg,
                              stacklevel=1)

        self.references[reference_id] = value

        return value
650 | ||
651 | ||
def convert(
    data: Union[parser.RData, parser.RObject],
    *args: Any,
    **kwargs: Any,
) -> Any:
    """
    Uses the default converter (:func:`SimpleConverter`) to convert the data.

    The extra positional and keyword arguments are forwarded to the
    constructor of :func:`SimpleConverter`.

    Examples:

        Parse one of the included examples, containing a vector

        >>> import rdata
        >>>
        >>> parsed = rdata.parser.parse_file(
        ...              rdata.TESTDATA_PATH / "test_vector.rda")
        >>> converted = rdata.conversion.convert(parsed)
        >>> converted
        {'test_vector': array([1., 2., 3.])}

        Parse another example, containing a dataframe

        >>> import rdata
        >>>
        >>> parsed = rdata.parser.parse_file(
        ...              rdata.TESTDATA_PATH / "test_dataframe.rda")
        >>> converted = rdata.conversion.convert(parsed)
        >>> converted
        {'test_dataframe':   class  value
        0     a      1
        1     b      2
        2     b      3}

    """
    converter = SimpleConverter(*args, **kwargs)
    return converter.convert(data)
0 | from ._parser import ( | |
1 | DEFAULT_ALTREP_MAP, | |
2 | CharFlags, | |
3 | RData, | |
4 | RObject, | |
5 | RObjectInfo, | |
6 | RObjectType, | |
7 | parse_data, | |
8 | parse_file, | |
9 | ) |
0 | from __future__ import annotations | |
1 | ||
2 | import abc | |
3 | import bz2 | |
4 | import enum | |
5 | import gzip | |
6 | import lzma | |
7 | import os | |
8 | import pathlib | |
9 | import warnings | |
10 | import xdrlib | |
11 | from dataclasses import dataclass | |
12 | from types import MappingProxyType | |
13 | from typing import ( | |
14 | Any, | |
15 | BinaryIO, | |
16 | Callable, | |
17 | List, | |
18 | Mapping, | |
19 | Optional, | |
20 | Set, | |
21 | TextIO, | |
22 | Tuple, | |
23 | Union, | |
24 | ) | |
25 | ||
26 | import numpy as np | |
27 | ||
28 | ||
class FileTypes(enum.Enum):
    """
    Kinds of file recognized from their leading bytes.
    """

    # Compressed containers
    bzip2 = "bz2"
    gzip = "gzip"
    xz = "xz"

    # Uncompressed rdata files
    rdata_binary_v2 = "rdata version 2 (binary)"
    rdata_binary_v3 = "rdata version 3 (binary)"
38 | ||
39 | ||
# Leading bytes that identify each file type. ``file_type`` compares the
# start of the data against these values in insertion order.
magic_dict = {
    FileTypes.bzip2: b"\x42\x5a\x68",
    FileTypes.gzip: b"\x1f\x8b",
    FileTypes.xz: b"\xFD7zXZ\x00",
    FileTypes.rdata_binary_v2: b"RDX2\n",
    FileTypes.rdata_binary_v3: b"RDX3\n"
}
47 | ||
48 | ||
def file_type(data: memoryview) -> Optional[FileTypes]:
    """
    Returns the type of the file.

    The start of ``data`` is compared with every entry of ``magic_dict``;
    the first matching type is returned, or ``None`` if none matches.
    """
    return next(
        (
            candidate
            for candidate, signature in magic_dict.items()
            if data[:len(signature)] == signature
        ),
        None,
    )
58 | ||
59 | ||
class RdataFormats(enum.Enum):
    """
    Serialization formats of a R file.
    """

    XDR = "XDR"
    ASCII = "ASCII"
    binary = "binary"
67 | ||
68 | ||
# Leading bytes that identify each serialization format. ``rdata_format``
# compares the start of the data against these values in insertion order.
format_dict = {
    RdataFormats.XDR: b"X\n",
    RdataFormats.ASCII: b"A\n",
    RdataFormats.binary: b"B\n",
}
74 | ||
75 | ||
def rdata_format(data: memoryview) -> Optional[RdataFormats]:
    """
    Returns the format of the data.

    The start of ``data`` is compared with every entry of ``format_dict``;
    the first matching format is returned, or ``None`` if none matches.
    """
    return next(
        (
            candidate
            for candidate, signature in format_dict.items()
            if data[:len(signature)] == signature
        ),
        None,
    )
85 | ||
86 | ||
class RObjectType(enum.Enum):
    """
    Type codes of R objects.
    """

    NIL = 0  # NULL
    SYM = 1  # symbols
    LIST = 2  # pairlists
    CLO = 3  # closures
    ENV = 4  # environments
    PROM = 5  # promises
    LANG = 6  # language objects
    SPECIAL = 7  # special functions
    BUILTIN = 8  # builtin functions
    CHAR = 9  # internal character strings
    LGL = 10  # logical vectors
    INT = 13  # integer vectors
    REAL = 14  # numeric vectors
    CPLX = 15  # complex vectors
    STR = 16  # character vectors
    DOT = 17  # dot-dot-dot object
    ANY = 18  # make "any" args work
    VEC = 19  # list (generic vector)
    EXPR = 20  # expression vector
    BCODE = 21  # byte code
    EXTPTR = 22  # external pointer
    WEAKREF = 23  # weak reference
    RAW = 24  # raw vector
    S4 = 25  # S4 classes not of simple type

    # Codes outside the contiguous range above
    ALTREP = 238  # Alternative representations
    EMPTYENV = 242  # Empty environment
    GLOBALENV = 253  # Global environment
    NILVALUE = 254  # NIL value
    REF = 255  # Reference
120 | ||
121 | ||
class CharFlags(enum.IntFlag):
    """
    Bit flags of a character object, as tested against ``info.gp``.
    """

    HAS_HASH = 1 << 0
    BYTES = 1 << 1
    LATIN1 = 1 << 2
    UTF8 = 1 << 3
    # Bit 4 is not assigned here.
    CACHED = 1 << 5
    ASCII = 1 << 6
129 | ||
130 | ||
@dataclass
class RVersions:
    """
    Version numbers read from the header of a R file.
    """

    format: int  # version of the file format
    serialized: int  # second integer of the versions header
    minimum: int  # third integer of the versions header
139 | ||
140 | ||
@dataclass
class RExtraInfo:
    """
    Extra information.

    Contains the default encoding (only in version 3).
    """

    # ``None`` when the file does not store an encoding.
    encoding: Optional[str] = None
149 | ||
150 | ||
@dataclass
class RObjectInfo:
    """
    Header information of a R object.
    """

    type: RObjectType
    object: bool  # when true, the converter applies a class constructor
    attributes: bool
    tag: bool
    gp: int  # general purpose bits; holds the CharFlags of CHAR objects
    reference: int  # non-zero for objects that refer to another one
162 | ||
163 | ||
@dataclass
class RObject:
    """
    Representation of a R object.
    """

    info: RObjectInfo
    value: Any
    attributes: Optional[RObject]
    tag: Optional[RObject] = None
    referenced_object: Optional[RObject] = None

    def _str_internal(
        self,
        indent: int = 0,
        used_references: Optional[Set[int]] = None
    ) -> str:
        """
        Build the indented, multi-line text representation of the object.

        Parameters
        ----------
        indent:
            Number of spaces placed before the type line. Section headers
            use ``indent + 2`` and nested content ``indent + 4``.
        used_references:
            Reference numbers that are printed as ``...`` instead of being
            expanded.

        Returns
        -------
        text: str
            The representation, terminated by a newline.

        """
        if used_references is None:
            used_references = set()

        header_pad = " " * (indent + 2)
        body_pad = " " * (indent + 4)
        child_indent = indent + 4

        pieces = [f"{' ' * indent}{self.info.type}\n"]

        if self.tag:
            tag_text = self.tag._str_internal(
                child_indent,
                used_references.copy(),
            )
            pieces.append(f"{header_pad}tag:\n{tag_text}\n")

        if self.info.reference:
            assert self.referenced_object
            if self.info.reference in used_references:
                reference_text = f"{body_pad}..."
            else:
                reference_text = self.referenced_object._str_internal(
                    child_indent,
                    used_references.copy(),
                )
            pieces.append(
                f"{header_pad}reference: "
                f"{self.info.reference}\n{reference_text}\n"
            )

        pieces.append(f"{header_pad}value:\n")

        payload = self.value
        if isinstance(payload, RObject):
            pieces.append(
                payload._str_internal(child_indent, used_references.copy()),
            )
        elif isinstance(payload, (tuple, list)):
            pieces.extend(
                elem._str_internal(child_indent, used_references.copy())
                for elem in payload
            )
        elif isinstance(payload, np.ndarray):
            if len(payload) > 4:
                # Long arrays are abbreviated to their first and last two
                # elements.
                shown = (f"[{payload[0]}, {payload[1]} ... "
                         f"{payload[-2]}, {payload[-1]}]\n")
            else:
                shown = f"{payload}\n"
            pieces.append(body_pad + shown)
        else:
            pieces.append(f"{body_pad}{payload}\n")

        if self.attributes:
            attr_text = self.attributes._str_internal(
                child_indent,
                used_references.copy(),
            )
            pieces.append(f"{header_pad}attributes:\n{attr_text}\n")

        return "".join(pieces)

    def __str__(self) -> str:
        return self._str_internal()
231 | ||
232 | ||
@dataclass
class RData:
    """
    Whole content of a parsed R file.
    """

    versions: RVersions  # result of parsing the versions header
    extra: RExtraInfo  # extra header information (holds the encoding)
    object: RObject  # top-level parsed object
241 | ||
242 | ||
@dataclass
class EnvironmentValue:
    """
    Content stored as the value of an environment object.
    """

    locked: bool
    enclosure: RObject
    frame: RObject
    hash_table: RObject
252 | ||
253 | ||
# Type of the functions that expand an alternative representation: they
# receive the state object and return the info and value of the expanded
# object.
AltRepConstructor = Callable[
    [RObject],
    Tuple[RObjectInfo, Any],
]
# Mapping from the name of an alternative representation class (as bytes)
# to the function that expands it.
AltRepConstructorMap = Mapping[bytes, AltRepConstructor]
259 | ||
260 | ||
def format_float_with_scipen(number: float, scipen: int) -> bytes:
    """
    Format a float as bytes, in fixed or scientific notation.

    The scientific form is used only when the fixed form is longer than it
    by more than ``scipen`` characters.

    Parameters
    ----------
    number:
        Number to format.
    scipen:
        Penalty, in characters, applied to the scientific notation.

    Returns
    -------
    text: bytes
        Encoded representation of the number.

    """
    fixed = np.format_float_positional(number, trim="-")
    scientific = np.format_float_scientific(number, trim="-")

    assert isinstance(fixed, str)
    assert isinstance(scientific, str)

    chosen = fixed
    if len(fixed) - len(scientific) > scipen:
        chosen = scientific

    return chosen.encode()
272 | ||
273 | ||
def deferred_string_constructor(
    state: RObject,
) -> Tuple[RObjectInfo, Any]:
    """
    Expand a deferred string conversion into a character vector.

    The numbers are read from ``state.value[0].value`` and the ``scipen``
    setting from ``state.value[1].value``. Each number is formatted with
    :func:`format_float_with_scipen` and wrapped in a ``CHAR`` object
    flagged as ASCII.

    Returns
    -------
    info, value:
        Info of a ``STR`` object and the list of ``CHAR`` objects.

    """
    numbers = state.value[0].value
    scipen = state.value[1].value

    def as_char(num: Any) -> RObject:
        # One CHAR object holding the formatted number.
        return RObject(
            info=RObjectInfo(
                type=RObjectType.CHAR,
                object=False,
                attributes=False,
                tag=False,
                gp=CharFlags.ASCII,
                reference=0,
            ),
            value=format_float_with_scipen(num, scipen),
            attributes=None,
            tag=None,
            referenced_object=None,
        )

    strings = [as_char(num) for num in numbers]

    str_info = RObjectInfo(
        type=RObjectType.STR,
        object=False,
        attributes=False,
        tag=False,
        gp=0,
        reference=0,
    )

    return str_info, strings
309 | ||
310 | ||
def compact_seq_constructor(
    state: RObject,
    *,
    is_int: bool = False
) -> Tuple[RObjectInfo, Any]:
    """
    Expand a compact sequence into a regular integer or real vector.

    The serialized state of a compact sequence holds three numbers:
    ``(length, first, increment)``. The expanded vector is therefore
    ``first + increment * [0, 1, ..., length - 1]``.

    Parameters
    ----------
    state:
        State object whose ``value`` holds the three numbers above.
    is_int:
        Build an integer (``INT``) sequence instead of a real (``REAL``)
        one.

    Returns
    -------
    info, value:
        Info of the expanded vector and the array with its elements.

    """
    new_info = RObjectInfo(
        type=RObjectType.INT if is_int else RObjectType.REAL,
        object=False,
        attributes=False,
        tag=False,
        gp=0,
        reference=0,
    )

    # The first element of the state is the LENGTH of the sequence, not its
    # end point, so it cannot be used as the ``stop`` of ``np.arange``.
    length = int(state.value[0])
    start = state.value[1]
    step = state.value[2]

    if is_int:
        start = int(start)
        step = int(step)

    value = start + step * np.arange(length)

    return new_info, value
338 | ||
339 | ||
def compact_intseq_constructor(
    state: RObject,
) -> Tuple[RObjectInfo, Any]:
    """
    Expand a compact integer sequence.

    Delegates to :func:`compact_seq_constructor` with ``is_int=True``.
    """
    expanded = compact_seq_constructor(state, is_int=True)
    return expanded
344 | ||
345 | ||
def compact_realseq_constructor(
    state: RObject,
) -> Tuple[RObjectInfo, Any]:
    """
    Expand a compact real sequence.

    Delegates to :func:`compact_seq_constructor` with ``is_int=False``.
    """
    expanded = compact_seq_constructor(state, is_int=False)
    return expanded
350 | ||
351 | ||
def wrap_constructor(
    state: RObject,
) -> Tuple[RObjectInfo, Any]:
    """
    Expand a wrapper representation into the object it wraps.

    The wrapped object is the first element of ``state.value``. The
    result keeps its type and value, with every other info field reset.

    Returns
    -------
    info, value:
        Fresh info carrying the type of the wrapped object, and its value.

    """
    wrapped = state.value[0]

    unwrapped_info = RObjectInfo(
        type=wrapped.info.type,
        object=False,
        attributes=False,
        tag=False,
        gp=0,
        reference=0,
    )

    return unwrapped_info, wrapped.value
368 | ||
369 | ||
# Backing dictionary of the default map, from the name of an alternative
# representation class to the function that expands it. All the ``wrap_*``
# classes share the same constructor.
default_altrep_map_dict: Mapping[bytes, AltRepConstructor] = {
    b"deferred_string": deferred_string_constructor,
    b"compact_intseq": compact_intseq_constructor,
    b"compact_realseq": compact_realseq_constructor,
    b"wrap_real": wrap_constructor,
    b"wrap_string": wrap_constructor,
    b"wrap_logical": wrap_constructor,
    b"wrap_integer": wrap_constructor,
    b"wrap_complex": wrap_constructor,
    b"wrap_raw": wrap_constructor,
}

# Read-only view of the dictionary above; default value of the
# ``altrep_constructor_dict`` parameter of ``Parser``.
DEFAULT_ALTREP_MAP = MappingProxyType(default_altrep_map_dict)
383 | ||
384 | ||
class Parser(abc.ABC):
    """
    Parser interface for a R file.

    Subclasses supply the low-level readers (:meth:`parse_int`,
    :meth:`parse_double` and :meth:`parse_string`); the remaining methods
    build on them to read the headers and the object tree.

    Parameters:
        expand_altrep: Whether to translate ALTREPs to normal objects.
        altrep_constructor_dict: Mapping from each ALTREP class name to
            the constructor used to expand it.
    """

    def __init__(
        self,
        *,
        expand_altrep: bool = True,
        altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
    ) -> None:
        self.expand_altrep = expand_altrep
        self.altrep_constructor_dict = altrep_constructor_dict

    def parse_bool(self) -> bool:
        """
        Parse a boolean.

        An integer is read and any nonzero value is considered true.
        """
        return bool(self.parse_int())

    @abc.abstractmethod
    def parse_int(self) -> int:
        """
        Parse an integer.
        """
        pass

    @abc.abstractmethod
    def parse_double(self) -> float:
        """
        Parse a double.
        """
        pass

    def parse_complex(self) -> complex:
        """
        Parse a complex number.

        The real part is read first, followed by the imaginary part.
        """
        return complex(self.parse_double(), self.parse_double())

    @abc.abstractmethod
    def parse_string(self, length: int) -> bytes:
        """
        Parse a string of ``length`` bytes.
        """
        pass

    def parse_all(self) -> RData:
        """
        Parse all the file.

        The versions header is read first, then the extra information and
        finally the serialized object.
        """

        versions = self.parse_versions()
        extra_info = self.parse_extra_info(versions)
        obj = self.parse_R_object()

        return RData(versions, extra_info, obj)

    def parse_versions(self) -> RVersions:
        """
        Parse the versions header.

        Raises:
            NotImplementedError: If the format version is neither 2 nor 3.
        """

        format_version = self.parse_int()
        r_version = self.parse_int()
        minimum_r_version = self.parse_int()

        if format_version not in [2, 3]:
            raise NotImplementedError(
                f"Format version {format_version} unsupported",
            )

        return RVersions(format_version, r_version, minimum_r_version)

    def parse_extra_info(self, versions: RVersions) -> RExtraInfo:
        """
        Parse the extra information header.

        Only formats with version 3 or greater carry this header, which
        holds the name of the encoding; for older formats nothing is read
        and the encoding is ``None``.
        """

        encoding = None

        if versions.format >= 3:
            encoding_len = self.parse_int()
            encoding = self.parse_string(encoding_len).decode("ASCII")

        extra_info = RExtraInfo(encoding)

        return extra_info

    def expand_altrep_to_object(
        self,
        info: RObject,
        state: RObject,
    ) -> Tuple[RObjectInfo, Any]:
        """
        Expand alternative representation to normal object.

        Parameters:
            info: ALTREP information list; its first element is the symbol
                (or a reference to the symbol) naming the ALTREP class.
            state: ALTREP state, passed unchanged to the constructor.

        Returns:
            Info and value of the expanded object, as returned by the
            constructor registered for the ALTREP class.

        Raises:
            KeyError: If no constructor is registered for the ALTREP class.
        """

        assert info.info.type == RObjectType.LIST

        # Follow references until the actual symbol is reached
        class_sym = info.value[0]
        while class_sym.info.type == RObjectType.REF:
            class_sym = class_sym.referenced_object

        assert class_sym.info.type == RObjectType.SYM
        assert class_sym.value.info.type == RObjectType.CHAR

        altrep_name = class_sym.value.value
        assert isinstance(altrep_name, bytes)

        constructor = self.altrep_constructor_dict[altrep_name]
        return constructor(state)

    def parse_R_object(
        self,
        reference_list: Optional[List[RObject]] = None
    ) -> RObject:
        """
        Parse a R object.

        Parameters:
            reference_list: Objects that later ``REF`` items may point to.
                A new list is created when omitted; recursive calls share
                the same list.

        Returns:
            The parsed object, including all its nested objects.

        Raises:
            NotImplementedError: If the object type, or the length of a
                ``CHAR`` object, is not supported.
        """

        if reference_list is None:
            # Reference indexes are 1-based while this list is 0-based;
            # the offset is applied when a REF is resolved.
            reference_list = []

        info_int = self.parse_int()

        info = parse_r_object_info(info_int)

        tag = None
        attributes = None
        referenced_object = None

        tag_read = False
        attributes_read = False
        add_reference = False

        result = None

        value: Any

        if info.type == RObjectType.NIL:
            value = None

        elif info.type == RObjectType.SYM:
            # Read Char
            value = self.parse_R_object(reference_list)
            # Symbols can be referenced
            add_reference = True

        elif info.type in [RObjectType.LIST, RObjectType.LANG]:
            tag = None
            if info.attributes:
                attributes = self.parse_R_object(reference_list)
                attributes_read = True

            # The tag is independent of the attributes: a node can carry
            # both, in which case the tag follows the attributes.
            if info.tag:
                tag = self.parse_R_object(reference_list)
                tag_read = True

            # Read CAR and CDR
            car = self.parse_R_object(reference_list)
            cdr = self.parse_R_object(reference_list)
            value = (car, cdr)

        elif info.type == RObjectType.ENV:
            # The environment is registered before its contents are read,
            # so that nested objects can refer back to it.
            result = RObject(
                info=info,
                tag=tag,
                attributes=attributes,
                value=None,
                referenced_object=referenced_object,
            )

            reference_list.append(result)

            locked = self.parse_bool()
            enclosure = self.parse_R_object(reference_list)
            frame = self.parse_R_object(reference_list)
            hash_table = self.parse_R_object(reference_list)
            attributes = self.parse_R_object(reference_list)

            value = EnvironmentValue(
                locked=locked,
                enclosure=enclosure,
                frame=frame,
                hash_table=hash_table,
            )

        elif info.type == RObjectType.CHAR:
            length = self.parse_int()
            if length > 0:
                value = self.parse_string(length=length)
            elif length == 0:
                value = b""
            elif length == -1:
                # A length of -1 is stored as a missing string
                value = None
            else:
                raise NotImplementedError(
                    f"Length of CHAR cannot be {length}")

        elif info.type == RObjectType.LGL:
            length = self.parse_int()

            value = np.empty(length, dtype=np.bool_)

            # NOTE(review): every nonzero integer is stored as True, which
            # presumably includes R's NA marker; confirm whether missing
            # values need dedicated handling.
            for i in range(length):
                value[i] = self.parse_bool()

        elif info.type == RObjectType.INT:
            length = self.parse_int()

            value = np.empty(length, dtype=np.int64)

            for i in range(length):
                value[i] = self.parse_int()

        elif info.type == RObjectType.REAL:
            length = self.parse_int()

            value = np.empty(length, dtype=np.double)

            for i in range(length):
                value[i] = self.parse_double()

        elif info.type == RObjectType.CPLX:
            length = self.parse_int()

            # ``np.complex_`` was an alias of ``np.complex128`` that is no
            # longer available in NumPy 2.
            value = np.empty(length, dtype=np.complex128)

            for i in range(length):
                value[i] = self.parse_complex()

        elif info.type in [RObjectType.STR,
                           RObjectType.VEC, RObjectType.EXPR]:
            length = self.parse_int()

            value = [None] * length

            for i in range(length):
                value[i] = self.parse_R_object(reference_list)

        elif info.type == RObjectType.S4:
            value = None

        elif info.type == RObjectType.ALTREP:
            altrep_info = self.parse_R_object(reference_list)
            altrep_state = self.parse_R_object(reference_list)
            altrep_attr = self.parse_R_object(reference_list)

            if self.expand_altrep:
                # The info of the expanded object replaces the ALTREP one
                info, value = self.expand_altrep_to_object(
                    info=altrep_info,
                    state=altrep_state,
                )
                attributes = altrep_attr
            else:
                value = (altrep_info, altrep_state, altrep_attr)

        elif info.type == RObjectType.EMPTYENV:
            value = None

        elif info.type == RObjectType.GLOBALENV:
            value = None

        elif info.type == RObjectType.NILVALUE:
            value = None

        elif info.type == RObjectType.REF:
            value = None
            # Index is 1-based
            referenced_object = reference_list[info.reference - 1]

        else:
            raise NotImplementedError(f"Type {info.type} not implemented")

        if info.tag and not tag_read:
            warnings.warn(f"Tag not implemented for type {info.type} "
                          "and ignored")
        if info.attributes and not attributes_read:
            attributes = self.parse_R_object(reference_list)

        if result is None:
            result = RObject(
                info=info,
                tag=tag,
                attributes=attributes,
                value=value,
                referenced_object=referenced_object,
            )
        else:
            # Fill in the object that was registered in advance
            result.info = info
            result.attributes = attributes
            result.value = value
            result.referenced_object = referenced_object

        if add_reference:
            reference_list.append(result)

        return result
682 | ||
683 | ||
class ParserXDR(Parser):
    """
    Parser for data whose integers and doubles are encoded in XDR format.

    The current read offset is kept in ``position`` and is advanced by
    every read.
    """

    def __init__(
        self,
        data: memoryview,
        position: int = 0,
        *,
        expand_altrep: bool = True,
        altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
    ) -> None:
        super().__init__(
            expand_altrep=expand_altrep,
            altrep_constructor_dict=altrep_constructor_dict,
        )
        self.data = data
        self.position = position
        self.xdr_parser = xdrlib.Unpacker(data)

    def _unpack_at_position(self, unpack: Any) -> Any:
        """Run ``unpack`` at the current offset and store the new offset."""
        unpacker = self.xdr_parser
        unpacker.set_position(self.position)
        unpacked = unpack()
        self.position = unpacker.get_position()
        return unpacked

    def parse_int(self) -> int:
        unpacked: int = self._unpack_at_position(self.xdr_parser.unpack_int)
        return unpacked

    def parse_double(self) -> float:
        unpacked: float = self._unpack_at_position(
            self.xdr_parser.unpack_double,
        )
        return unpacked

    def parse_string(self, length: int) -> bytes:
        # Strings are sliced directly from the buffer, without the unpacker
        start = self.position
        chunk = self.data[start:(start + length)]
        self.position = start + length
        return bytes(chunk)
723 | ||
724 | ||
def parse_file(
    file_or_path: Union[BinaryIO, TextIO, 'os.PathLike[Any]', str],
    *,
    expand_altrep: bool = True,
    altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
) -> RData:
    """
    Parse a R file (.rda or .rdata).

    Parameters:
        file_or_path (file-like, str or path-like): File
            in the R serialization format. An already opened file can be
            in binary or in text mode; for the latter, its underlying
            binary ``buffer`` is read.
        expand_altrep (bool): Whether to translate ALTREPs to normal objects.
        altrep_constructor_dict: Dictionary mapping each ALTREP to
            its constructor.

    Returns:
        RData: Data contained in the file (versions and object).

    See Also:
        :func:`parse_data`: Similar function that receives the data directly.

    Examples:

        Parse one of the included examples, containing a vector

        >>> import rdata
        >>>
        >>> parsed = rdata.parser.parse_file(
        ...              rdata.TESTDATA_PATH / "test_vector.rda")
        >>> parsed
        RData(versions=RVersions(format=2,
                serialized=196610,
                minimum=131840),
            extra=RExtraInfo(encoding=None),
            object=RObject(info=RObjectInfo(type=<RObjectType.LIST: 2>,
                    object=False,
                    attributes=False,
                    tag=True,
                    gp=0,
                    reference=0),
                value=(RObject(info=RObjectInfo(type=<RObjectType.REAL: 14>,
                            object=False,
                            attributes=False,
                            tag=False,
                            gp=0,
                            reference=0),
                        value=array([1., 2., 3.]),
                        attributes=None,
                        tag=None,
                        referenced_object=None),
                    RObject(info=RObjectInfo(type=<RObjectType.NILVALUE: 254>,
                            object=False,
                            attributes=False,
                            tag=False,
                            gp=0,
                            reference=0),
                        value=None,
                        attributes=None,
                        tag=None,
                        referenced_object=None)),
                attributes=None,
                tag=RObject(info=RObjectInfo(type=<RObjectType.SYM: 1>,
                        object=False,
                        attributes=False,
                        tag=False,
                        gp=0,
                        reference=0),
                    value=RObject(info=RObjectInfo(type=<RObjectType.CHAR: 9>,
                            object=False,
                            attributes=False,
                            tag=False,
                            gp=64,
                            reference=0),
                        value=b'test_vector',
                        attributes=None,
                        tag=None,
                        referenced_object=None),
                    attributes=None,
                    tag=None,
                    referenced_object=None),
                referenced_object=None))

    """
    if isinstance(file_or_path, (os.PathLike, str)):
        path = pathlib.Path(file_or_path)
        data = path.read_bytes()
    else:
        # file is a pre-opened file
        buffer: Optional[BinaryIO] = getattr(file_or_path, 'buffer', None)

        # Text-mode files expose their binary stream as ``buffer``; files
        # without it are assumed to be binary already. No runtime
        # ``isinstance`` check against ``BinaryIO`` is made because real
        # binary streams (such as the ones returned by ``open(..., "rb")``
        # or ``io.BytesIO``) are not subclasses of ``typing.BinaryIO``.
        binary_file: Any = file_or_path if buffer is None else buffer
        data = binary_file.read()
    return parse_data(
        data,
        expand_altrep=expand_altrep,
        altrep_constructor_dict=altrep_constructor_dict,
    )
826 | ||
827 | ||
def parse_data(
    data: bytes,
    *,
    expand_altrep: bool = True,
    altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
) -> RData:
    """
    Parse the contents of a R file, given as a sequence of bytes.

    Compressed data (bzip2, gzip or xz) is decompressed and then parsed
    again by this same function; uncompressed R data is handed to the
    binary parser once its magic number has been removed.

    Parameters:
        data (bytes): Data extracted of a R file.
        expand_altrep (bool): Whether to translate ALTREPs to normal objects.
        altrep_constructor_dict: Dictionary mapping each ALTREP to
            its constructor.

    Returns:
        RData: Data contained in the file (versions and object).

    See Also:
        :func:`parse_file`: Similar function that parses a file directly.

    Examples:

        Parse one of the included examples, containing a vector

        >>> import rdata
        >>>
        >>> with open(rdata.TESTDATA_PATH / "test_vector.rda", "rb") as f:
        ...     parsed = rdata.parser.parse_data(f.read())
        >>>
        >>> parsed
        RData(versions=RVersions(format=2,
                serialized=196610,
                minimum=131840),
            extra=RExtraInfo(encoding=None),
            object=RObject(info=RObjectInfo(type=<RObjectType.LIST: 2>,
                    object=False,
                    attributes=False,
                    tag=True,
                    gp=0,
                    reference=0),
                value=(RObject(info=RObjectInfo(type=<RObjectType.REAL: 14>,
                            object=False,
                            attributes=False,
                            tag=False,
                            gp=0,
                            reference=0),
                        value=array([1., 2., 3.]),
                        attributes=None,
                        tag=None,
                        referenced_object=None),
                    RObject(info=RObjectInfo(type=<RObjectType.NILVALUE: 254>,
                            object=False,
                            attributes=False,
                            tag=False,
                            gp=0,
                            reference=0),
                        value=None,
                        attributes=None,
                        tag=None,
                        referenced_object=None)),
                attributes=None,
                tag=RObject(info=RObjectInfo(type=<RObjectType.SYM: 1>,
                        object=False,
                        attributes=False,
                        tag=False,
                        gp=0,
                        reference=0),
                    value=RObject(info=RObjectInfo(type=<RObjectType.CHAR: 9>,
                            object=False,
                            attributes=False,
                            tag=False,
                            gp=64,
                            reference=0),
                        value=b'test_vector',
                        attributes=None,
                        tag=None,
                        referenced_object=None),
                    attributes=None,
                    tag=None,
                    referenced_object=None),
                referenced_object=None))

    """
    view = memoryview(data)
    filetype = file_type(view)

    if filetype in {FileTypes.rdata_binary_v2, FileTypes.rdata_binary_v3}:
        # Uncompressed R data: skip the magic number and parse the rest
        return parse_rdata_binary(
            view[len(magic_dict[filetype]):],
            expand_altrep=expand_altrep,
            altrep_constructor_dict=altrep_constructor_dict,
        )

    if filetype is FileTypes.bzip2:
        decompressed = bz2.decompress(data)
    elif filetype is FileTypes.gzip:
        decompressed = gzip.decompress(data)
    elif filetype is FileTypes.xz:
        decompressed = lzma.decompress(data)
    else:
        raise NotImplementedError("Unknown file type")

    # The type of the decompressed payload is detected again
    return parse_data(
        decompressed,
        expand_altrep=expand_altrep,
        altrep_constructor_dict=altrep_constructor_dict,
    )
941 | ||
942 | ||
def parse_rdata_binary(
    data: memoryview,
    expand_altrep: bool = True,
    altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
) -> RData:
    """
    Select the appropriate parser and parse all the info.

    The format marker at the start of ``data`` is detected and removed
    before parsing. Only the XDR format is supported; any other format
    raises ``NotImplementedError``.
    """
    detected_format = rdata_format(data)

    # Drop the format marker, when one was recognized
    payload = (
        data[len(format_dict[detected_format]):]
        if detected_format
        else data
    )

    if detected_format is not RdataFormats.XDR:
        raise NotImplementedError("Unknown file format")

    return ParserXDR(
        payload,
        expand_altrep=expand_altrep,
        altrep_constructor_dict=altrep_constructor_dict,
    ).parse_all()
965 | ||
966 | ||
def bits(data: int, start: int, stop: int) -> int:
    """
    Extract the bits in the half-open range [start, stop) of an integer.

    Bit 0 is the least significant one; the selected bits are returned
    shifted down so that bit ``start`` becomes bit 0 of the result.
    """
    width_mask = (1 << (stop - start)) - 1
    return (data >> start) & width_mask
976 | ||
977 | ||
def is_special_r_object_type(r_object_type: RObjectType) -> bool:
    """
    Tell whether a R type is serialized differently from the usual types.

    This is the case for ``NILVALUE`` and ``REF`` only.
    """
    if r_object_type is RObjectType.NILVALUE:
        return True

    return r_object_type is RObjectType.REF
984 | ||
985 | ||
def parse_r_object_info(info_int: int) -> RObjectInfo:
    """
    Decode the packed integer that precedes every serialized object.

    The lowest 8 bits hold the object type. For the usual types, bits 8,
    9 and 10 are the object, attributes and tag flags, and bits 12 to 27
    are the ``gp`` field. Special types have all of those cleared; for a
    ``REF``, bits 8 to 31 hold the reference index instead.
    """
    type_exp = RObjectType(bits(info_int, 0, 8))

    if is_special_r_object_type(type_exp):
        object_flag = attributes = tag = False
        gp = 0
    else:
        object_flag, attributes, tag = (
            bool(bits(info_int, first_bit, first_bit + 1))
            for first_bit in (8, 9, 10)
        )
        gp = bits(info_int, 12, 28)

    reference = bits(info_int, 8, 32) if type_exp == RObjectType.REF else 0

    return RObjectInfo(
        type=type_exp,
        object=object_flag,
        attributes=attributes,
        tag=tag,
        gp=gp,
        reference=reference
    )
0 | # Marker file for PEP 561. The rdata package uses inline types.⏎ |
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
Binary diff not shown
0 | import unittest | |
1 | from collections import ChainMap | |
2 | from fractions import Fraction | |
3 | from types import SimpleNamespace | |
4 | from typing import Any, Dict | |
5 | ||
6 | import numpy as np | |
7 | import pandas as pd | |
8 | ||
9 | import rdata | |
10 | ||
# Directory with the example R files used by the tests below.
TESTDATA_PATH = rdata.TESTDATA_PATH
12 | ||
13 | ||
class SimpleTests(unittest.TestCase):
    """Parse and convert each of the example files and check the result."""

    def test_opened_file(self) -> None:
        """Test that an already opened file can be parsed."""
        # The file is opened in text mode on purpose: the parser has to
        # reach the underlying binary buffer. The context manager ensures
        # that the handle is closed.
        with open(TESTDATA_PATH / "test_vector.rda") as f:
            parsed = rdata.parser.parse_file(f)
        converted = rdata.conversion.convert(parsed)

        self.assertIsInstance(converted, dict)

    def test_opened_string(self) -> None:
        """Test that a path given as a string can be parsed."""
        parsed = rdata.parser.parse_file(str(TESTDATA_PATH /
                                             "test_vector.rda"))
        converted = rdata.conversion.convert(parsed)

        self.assertIsInstance(converted, dict)

    def test_logical(self) -> None:
        """Test the conversion of a logical vector."""
        parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_logical.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_logical": np.array([True, True, False, True, False])
        })

    def test_vector(self) -> None:
        """Test the conversion of a numeric vector."""
        parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_vector.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_vector": np.array([1., 2., 3.])
        })

    def test_empty_string(self) -> None:
        """Test the conversion of an empty string."""
        parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_empty_str.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_empty_str": [""]
        })

    def test_na_string(self) -> None:
        """Test the conversion of a missing (NA) string."""
        parsed = rdata.parser.parse_file(
            TESTDATA_PATH / "test_na_string.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_na_string": [None]
        })

    def test_complex(self) -> None:
        """Test the conversion of a complex vector."""
        parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_complex.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_complex": np.array([1 + 2j, 2, 0, 1 + 3j, -1j])
        })

    def test_matrix(self) -> None:
        """Test the conversion of a matrix."""
        parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_matrix.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_matrix": np.array([[1., 2., 3.],
                                     [4., 5., 6.]])
        })

    def test_list(self) -> None:
        """Test the conversion of a list."""
        parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_list":
                [
                    np.array([1.]),
                    ['a', 'b', 'c'],
                    np.array([2., 3.]),
                    ['hi']
                ]
        })

    def test_expression(self) -> None:
        """Test the conversion of an expression."""
        parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_expression.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_expression": rdata.conversion.RExpression([
                rdata.conversion.RLanguage(['^', 'base', 'exponent'])])
        })

    def test_encodings(self) -> None:
        """Test strings with several encodings in a version 2 file."""

        # ``msg`` is only the message shown if no warning is raised; the
        # text of the warning itself is not compared.
        with self.assertWarns(
            UserWarning,
            msg="Unknown encoding. Assumed ASCII."
        ):
            parsed = rdata.parser.parse_file(
                TESTDATA_PATH / "test_encodings.rda",
            )
            converted = rdata.conversion.convert(parsed)

            np.testing.assert_equal(converted, {
                "test_encoding_utf8": ["eĥoŝanĝo ĉiuĵaŭde"],
                "test_encoding_latin1": ["cañón"],
                "test_encoding_bytes": [b"reba\xf1o"],
                "test_encoding_latin1_implicit": [b"\xcd\xf1igo"],
            })

    def test_encodings_v3(self) -> None:
        """Test strings with several encodings in a version 3 file."""

        parsed = rdata.parser.parse_file(
            TESTDATA_PATH / "test_encodings_v3.rda",
        )
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_encoding_utf8": ["eĥoŝanĝo ĉiuĵaŭde"],
            "test_encoding_latin1": ["cañón"],
            "test_encoding_bytes": [b"reba\xf1o"],
            "test_encoding_latin1_implicit": ["Íñigo"],
        })

    def test_dataframe(self) -> None:
        """Test the conversion of a dataframe, in both file versions."""

        # A tuple keeps the order of the subtests stable between runs
        for f in ("test_dataframe.rda", "test_dataframe_v3.rda"):
            with self.subTest(file=f):
                parsed = rdata.parser.parse_file(
                    TESTDATA_PATH / f,
                )
                converted = rdata.conversion.convert(parsed)

                pd.testing.assert_frame_equal(
                    converted["test_dataframe"],
                    pd.DataFrame({
                        "class": pd.Categorical(
                            ["a", "b", "b"]),
                        "value": [1, 2, 3],
                    })
                )

    def test_ts(self) -> None:
        """Test the conversion of a time series."""
        parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_ts.rda")
        converted = rdata.conversion.convert(parsed)

        pd.testing.assert_series_equal(converted["test_ts"],
                                       pd.Series({
                                           2000 + Fraction(2, 12): 1.,
                                           2000 + Fraction(3, 12): 2.,
                                           2000 + Fraction(4, 12): 3.,
                                       }))

    def test_s4(self) -> None:
        """Test the conversion of a S4 object."""
        parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_s4.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_s4": SimpleNamespace(
                age=np.array(28),
                name=["Carlos"],
                **{'class': ["Person"]}
            )
        })

    def test_environment(self) -> None:
        """Test the conversion of an environment."""
        parsed = rdata.parser.parse_file(
            TESTDATA_PATH / "test_environment.rda")
        converted = rdata.conversion.convert(parsed)

        dict_env = {'string': ['test']}
        empty_global_env: Dict[str, Any] = {}

        np.testing.assert_equal(converted, {
            "test_environment": ChainMap(dict_env, ChainMap(empty_global_env))
        })

        global_env = {"global": "test"}

        converted_global = rdata.conversion.convert(
            parsed,
            global_environment=global_env,
        )

        np.testing.assert_equal(converted_global, {
            "test_environment": ChainMap(dict_env, ChainMap(global_env))
        })

    def test_emptyenv(self) -> None:
        """Test the conversion of the empty environment."""
        parsed = rdata.parser.parse_file(
            TESTDATA_PATH / "test_emptyenv.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_emptyenv": ChainMap({})
        })

    def test_list_attrs(self) -> None:
        """Test the conversion of a list with attributes."""
        parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list_attrs.rda")
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_list_attrs": [['list'], [5]]
        })

    def test_altrep_compact_intseq(self) -> None:
        """Test alternative representation of sequences of ints."""
        parsed = rdata.parser.parse_file(
            TESTDATA_PATH / "test_altrep_compact_intseq.rda",
        )
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_altrep_compact_intseq": np.arange(1000),
        })

    def test_altrep_compact_realseq(self) -> None:
        """Test alternative representation of sequences of reals."""
        parsed = rdata.parser.parse_file(
            TESTDATA_PATH / "test_altrep_compact_realseq.rda",
        )
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_altrep_compact_realseq": np.arange(1000.0),
        })

    def test_altrep_deferred_string(self) -> None:
        """Test alternative representation of deferred strings."""
        parsed = rdata.parser.parse_file(
            TESTDATA_PATH / "test_altrep_deferred_string.rda",
        )
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_altrep_deferred_string": [
                "1", "2.3", "10000",
                "1e+05", "-10000", "-1e+05",
                "0.001", "1e-04", "1e-05",
            ],
        })

    def test_altrep_wrap_real(self) -> None:
        """Test alternative representation of wrap_real."""
        parsed = rdata.parser.parse_file(
            TESTDATA_PATH / "test_altrep_wrap_real.rda",
        )
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_altrep_wrap_real": [3],
        })

    def test_altrep_wrap_string(self) -> None:
        """Test alternative representation of wrap_string."""
        parsed = rdata.parser.parse_file(
            TESTDATA_PATH / "test_altrep_wrap_string.rda",
        )
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_altrep_wrap_string": ["Hello"],
        })

    def test_altrep_wrap_logical(self) -> None:
        """Test alternative representation of wrap_logical."""
        parsed = rdata.parser.parse_file(
            TESTDATA_PATH / "test_altrep_wrap_logical.rda",
        )
        converted = rdata.conversion.convert(parsed)

        np.testing.assert_equal(converted, {
            "test_altrep_wrap_logical": [True],
        })
285 | ||
286 | ||
if __name__ == "__main__":
    # Run every test of this module when it is executed as a script.
    unittest.main()
0 | [aliases] | |
1 | test=pytest | |
2 | ||
3 | [tool:pytest] | |
4 | addopts = --doctest-modules --doctest-glob="*.rst" | |
5 | doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS | |
6 | ||
7 | [isort] | |
8 | multi_line_output = 3 | |
9 | include_trailing_comma = true | |
10 | use_parentheses = true | |
11 | combine_as_imports = 1 | |
12 | ||
13 | [mypy] | |
14 | strict = True | |
15 | strict_equality = True | |
16 | implicit_reexport = True | |
17 | ||
18 | [mypy-numpy.*] | |
19 | ignore_missing_imports = True | |
20 | ||
21 | [mypy-pandas.*] | |
22 | ignore_missing_imports = True | |
23 | ||
24 | [mypy-setuptools.*] | |
25 | ignore_missing_imports = True⏎ |
0 | # encoding: utf-8 | |
1 | ||
2 | """ | |
3 | Read R datasets from Python. | |
4 | ||
5 | This package parses .rda datasets used in R. It does not depend on the R | |
6 | language or its libraries, and thus it is released under a MIT license. | |
7 | """ | |
8 | import os | |
9 | import sys | |
10 | ||
11 | from setuptools import find_packages, setup | |
12 | ||
# pytest-runner is only required when one of the test commands is requested.
needs_pytest = {'pytest', 'test', 'ptr'}.intersection(sys.argv)
pytest_runner = ['pytest-runner'] if needs_pytest else []

# The module docstring doubles as the short and long package descriptions.
DOCLINES = (__doc__ or '').split("\n")

# The package version is kept in the VERSION file next to this script.
with open(os.path.join(os.path.dirname(__file__),
                       'VERSION'), 'r') as version_file:
    version = version_file.read().strip()

setup(name='rdata',
      version=version,
      description=DOCLINES[1],
      long_description="\n".join(DOCLINES[3:]),
      url='https://github.com/vnmabus/rdata',
      author='Carlos Ramos Carreño',
      author_email='vnmabus@gmail.com',
      include_package_data=True,
      platforms=['any'],
      license='MIT',
      packages=find_packages(),
      python_requires='>=3.7, <4',
      # The listed versions agree with ``python_requires`` above.
      classifiers=[
          'Development Status :: 4 - Beta',
          'Intended Audience :: Developers',
          'Intended Audience :: Science/Research',
          'License :: OSI Approved :: MIT License',
          'Natural Language :: English',
          'Operating System :: OS Independent',
          'Programming Language :: Python :: 3',
          'Programming Language :: Python :: 3.7',
          'Programming Language :: Python :: 3.8',
          'Programming Language :: Python :: 3.9',
          'Topic :: Scientific/Engineering :: Mathematics',
          'Topic :: Software Development :: Libraries :: Python Modules',
          'Typing :: Typed',
      ],
      keywords=['rdata', 'r', 'dataset'],
      install_requires=['numpy',
                        'xarray',
                        'pandas'],
      setup_requires=pytest_runner,
      tests_require=['pytest-cov',
                     'numpy>=1.14'  # The printing format for numpy changes
                     ],
      test_suite='rdata.tests',
      zip_safe=False)