New upstream version 0.6.0

--- .coveragerc ---

[run]
omit =
    */test_*.py
    fsspec/_version.py
source =
    fsspec

[report]
# Regexes for lines to exclude from consideration
exclude_lines =
    pragma: no cover

    raise AssertionError
    raise NotImplementedError
    pass

ignore_errors = True

--- .gitattributes ---

fsspec/_version.py export-subst

--- .gitignore ---

# Dask
dask-worker-space

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
pip-wheel-metadata/

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/
.idea/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--- .pre-commit-config.yaml ---

exclude: >
    (?x)^(
        \.tox/.*
    )$
default_language_version:
    python: python3.7
repos:
  - repo: local
    hooks:
      - id: black
        name: black
        entry: black
        language: python
        require_serial: true
        types: [python]
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
      - id: flake8

--- .travis.yml ---

sudo: required
dist: xenial
os:
  - linux
services:
  - docker

language: generic
env:
  - TOXENV=py35
  - TOXENV=py36
  - TOXENV=py37
  - TOXENV=coverage
  - TOXENV=lint
  - TOXENV=s3fs
  - TOXENV=gcsfs
install:
  - source ci/install.sh
script:
  - tox -v

notifications:
  email: false

--- LICENSE ---

BSD 3-Clause License

Copyright (c) 2018, Martin Durant
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--- MANIFEST.in ---

include versioneer.py
include fsspec/_version.py

include LICENSE
include README.rst
include requirements.txt

--- README.md ---

# filesystem_spec

[![Build Status](https://travis-ci.org/intake/filesystem_spec.svg?branch=master)](https://travis-ci.org/intake/filesystem_spec)
[![Docs](https://readthedocs.org/projects/filesystem-spec/badge/?version=latest)](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)

A specification for pythonic filesystems.

## Install

```bash
pip install fsspec
```
or
```bash
conda install -c conda-forge fsspec
```

## Purpose

To produce a template or specification for a file-system interface that specific implementations should follow,
so that applications making use of them can rely on a common behaviour and not have to worry about the specific
internal implementation decisions of any given backend. Many such implementations are included in this package,
or in sister projects such as `s3fs` and `gcsfs`.

In addition, if this is well designed, then additional functionality, such as a key-value store or FUSE
mounting of the file-system implementation, may be available for all implementations "for free".

## Documentation

Please refer to [RTD](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)

## Develop

fsspec uses [tox](https://tox.readthedocs.io/en/latest/) and
[tox-conda](https://github.com/tox-dev/tox-conda) to manage dev and test
environments. First, install conda with tox and tox-conda in a base environment
(eg. `conda install -c conda-forge tox tox-conda`). Calls to `tox` can then be
used to configure a development environment and run tests.

Next, set up a development conda environment via `tox -e dev`. This will
install fsspec dependencies, test & dev tools, and install fsspec in develop
mode. Then, activate the dev environment under `.tox/dev` via `conda activate .tox/dev`.

### Testing

Tests can be run directly in the activated dev environment via `pytest fsspec`.

The full fsspec test suite can be run via `tox`, which will set up and execute
tests against multiple dependency versions in isolated environments. Run `tox
-av` to list available test environments, and select environments via `tox -e <env>`.

The full fsspec suite requires a system-level docker, docker-compose, and fuse
installation. See `ci/install.sh` for a detailed installation example.

### Code Formatting

fsspec uses [Black](https://black.readthedocs.io/en/stable) to ensure
a consistent code format throughout the project. `black` is automatically
installed in the tox dev env, activated via `conda activate .tox/dev`.

Then, run `black fsspec` from the root of the filesystem_spec repository to
auto-format your code. Additionally, many editors have plugins that will apply
`black` as you edit files.

Optionally, you may wish to set up [pre-commit hooks](https://pre-commit.com) to
automatically run `black` when you make a git commit; `black` is already
available in the tox dev env described above.

Then, run `pre-commit install --install-hooks` from the root of the
filesystem_spec repository to set up pre-commit hooks. `black` will now be run
before you commit, reformatting any changed files. You can format without
committing via `pre-commit run` or skip these checks with `git commit
--no-verify`.

--- ci/install.sh ---

#!/usr/bin/env bash
# https://docs.travis-ci.com/user/docker/#using-docker-compose

DOCKER_COMPOSE_VERSION=${DOCKER_COMPOSE_VERSION:-1.23.2}

# Install docker
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
sudo apt-get update
sudo apt-get -y -o Dpkg::Options::="--force-confnew" install docker-ce

# Update docker-compose
sudo rm /usr/local/bin/docker-compose
curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose
chmod +x docker-compose
sudo mv docker-compose /usr/local/bin

# install FUSE
sudo apt-get install libfuse-dev

# install conda
source $(dirname $BASH_SOURCE)/install_conda.sh

--- ci/install_conda.sh ---

#!/usr/bin/env bash

# Install conda
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda config --set always_yes yes --set changeps1 no
conda update conda
conda install -c conda-forge tox tox-conda

--- docs/Makefile ---

# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SPHINXPROJ    = fsspec
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

--- docs/README.md ---

# Building Documentation

A basic Python environment is required to build the docs; the needed packages
are listed in `environment.yml`.

To make HTML documentation:

```bash
make html
```

Outputs to `build/html/index.html`

--- docs/environment.yml ---

name: fsspec
channels:
  - defaults
  - conda-forge
dependencies:
  - python=3.6
  - paramiko
  - requests
  - numpydoc

--- docs/make.bat ---

@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
set SPHINXPROJ=fsspec

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd

--- docs/source/api.rst ---

API Reference
=============

.. currentmodule:: fsspec

User Functions
--------------

.. autosummary::
   fsspec.open_files
   fsspec.open
   fsspec.filesystem
   fsspec.get_filesystem_class
   fsspec.get_mapper
   fsspec.fuse.run

.. autofunction:: fsspec.open_files
.. autofunction:: fsspec.open
.. autofunction:: fsspec.filesystem
.. autofunction:: fsspec.get_filesystem_class
.. autofunction:: fsspec.get_mapper
.. autofunction:: fsspec.fuse.run

Base Classes
------------

.. autosummary::
   fsspec.spec.AbstractFileSystem
   fsspec.spec.Transaction
   fsspec.spec.AbstractBufferedFile
   fsspec.FSMap
   fsspec.core.OpenFile
   fsspec.core.BaseCache

.. autoclass:: fsspec.spec.AbstractFileSystem

.. autoclass:: fsspec.spec.Transaction
   :members:

.. autoclass:: fsspec.spec.AbstractBufferedFile
   :members:

.. autoclass:: fsspec.FSMap
   :members:

.. autoclass:: fsspec.core.OpenFile
   :members:

.. autoclass:: fsspec.core.BaseCache
   :members:


.. _implementations:

Built-in Implementations
------------------------

.. autosummary::
   fsspec.implementations.ftp.FTPFileSystem
   fsspec.implementations.hdfs.PyArrowHDFS
   fsspec.implementations.http.HTTPFileSystem
   fsspec.implementations.local.LocalFileSystem
   fsspec.implementations.memory.MemoryFileSystem
   fsspec.implementations.sftp.SFTPFileSystem
   fsspec.implementations.webhdfs.WebHDFS
   fsspec.implementations.zip.ZipFileSystem
   fsspec.implementations.cached.CachingFileSystem
   fsspec.implementations.cached.WholeFileCacheFileSystem

.. autoclass:: fsspec.implementations.ftp.FTPFileSystem
   :members: __init__

.. autoclass:: fsspec.implementations.hdfs.PyArrowHDFS
   :members: __init__

.. autoclass:: fsspec.implementations.http.HTTPFileSystem
   :members: __init__

.. autoclass:: fsspec.implementations.local.LocalFileSystem
   :members:

.. autoclass:: fsspec.implementations.memory.MemoryFileSystem
   :members: __init__

.. autoclass:: fsspec.implementations.sftp.SFTPFileSystem
   :members: __init__

.. autoclass:: fsspec.implementations.webhdfs.WebHDFS
   :members: __init__

.. autoclass:: fsspec.implementations.zip.ZipFileSystem
   :members: __init__

.. autoclass:: fsspec.implementations.cached.CachingFileSystem
   :members: __init__

.. autoclass:: fsspec.implementations.cached.WholeFileCacheFileSystem

.. _readbuffering:

Read Buffering
--------------

.. autosummary::

   fsspec.caching.ReadAheadCache
   fsspec.caching.BytesCache
   fsspec.caching.MMapCache
   fsspec.caching.BlockCache

.. autoclass:: fsspec.caching.ReadAheadCache
   :members:

.. autoclass:: fsspec.caching.BytesCache
   :members:

.. autoclass:: fsspec.caching.MMapCache
   :members:

.. autoclass:: fsspec.caching.BlockCache
   :members:

--- docs/source/changelog.rst ---

Changelog
=========

Version 0.6.0
-------------

* Fixed issues with filesystem instance caching. This was causing authorization errors
  in downstream libraries like ``gcsfs`` and ``s3fs`` in multi-threaded code (:pr:`155`, :pr:`181`)
* Changed the default file caching strategy to :class:`fsspec.caching.ReadAheadCache` (:pr:`193`)
* Moved file caches to the new ``fsspec.caching`` module. They're still available from
  their old location in ``fsspec.core``, but we recommend using the new location for new code (:pr:`195`)
* Added a new file caching strategy, :class:`fsspec.caching.BlockCache`, for fetching and caching
  file reads in blocks (:pr:`191`).
* Fixed equality checks for file system instances to return ``False`` when compared to objects
  other than file systems (:pr:`192`)
* Fixed a bug in :meth:`fsspec.FSMap.keys` returning a generator, which was consumed upon iteration (:pr:`189`).
* Removed the magic addition of aliases in ``AbstractFileSystem.__init__``. Now alias methods are always
  present (:pr:`177`)
* Deprecated passing ``trim`` to :class:`fsspec.spec.AbstractBufferedFile`. Pass it in ``storage_options`` instead (:pr:`188`)
* Improved handling of requests for :class:`fsspec.implementations.http.HTTPFileSystem` when the
  HTTP server responds with an (incorrect) content-length of 0 (:pr:`163`)
* Added a ``detail=True`` parameter to :meth:`fsspec.spec.AbstractFileSystem.ls` (:pr:`168`)
* Fixed handling of UNC/DFS paths (:issue:`154`)

--- docs/source/conf.py ---

# -*- coding: utf-8 -*-
#
# fsspec documentation build configuration file, created by
# sphinx-quickstart on Mon Jan 15 18:11:02 2018.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys

sys.path.insert(0, os.path.abspath("../.."))


# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.viewcode",
    "sphinx.ext.autosummary",
    "sphinx.ext.extlinks",
    "numpydoc",
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = ".rst"

# The master toctree document.
master_doc = "index"

# General information about the project.
project = "fsspec"
copyright = "2018, Martin Durant"
author = "Martin Durant"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
import fsspec

version = fsspec.__version__
# The full version, including alpha/beta/rc tags.
release = fsspec.__version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = []

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# This is required for the alabaster theme
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
html_sidebars = {
    "**": [
        "relations.html",  # needs 'show_related': True theme option to display
        "searchbox.html",
    ]
}


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = "fsspecdoc"


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, "fsspec.tex", "fsspec Documentation", "Joseph Crail", "manual")
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "fsspec", "fsspec Documentation", [author], 1)]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (
        master_doc,
        "fsspec",
        "fsspec Documentation",
        author,
        "fsspec",
        "One line description of project.",
        "Miscellaneous",
    )
]

extlinks = {
    "issue": ("https://github.com/intake/filesystem_spec/issues/%s", "GH#"),
    "pr": ("https://github.com/intake/filesystem_spec/pull/%s", "GH#"),
}

--- docs/source/features.rst ---

Features of fsspec
==================

Consistent API to many different storage backends. The general API and functionality were
proven with the projects `s3fs`_ and `gcsfs`_ (along with `hdfs3`_ and `adlfs`_), within the
context of Dask and independently. These have been tried and tested by many users and have shown their
usefulness over some years. ``fsspec`` aims to build on these and unify their models, as well
as to extract the file-system handling code from Dask, which does not fit so comfortably within a
library designed for the creation and scheduling of task graphs.

.. _s3fs: https://s3fs.readthedocs.io/en/latest/
.. _gcsfs: https://gcsfs.readthedocs.io/en/latest/
.. _hdfs3: https://hdfs3.readthedocs.io/en/latest/
.. _adlfs: https://azure-datalake-store.readthedocs.io/en/latest/

Here follows a brief description of some notable features of ``fsspec`` that promise to make
it an interesting project beyond some other file-system abstractions.

Serialisability
---------------

Coming out of the Dask stable, it was an important design decision that file-system instances
be serialisable, so that they could be created in one process (e.g., the client) and used in
other processes (typically the workers). These other processes may even be on other machines,
so in many cases they would need to be able to re-establish credentials, ideally without passing
sensitive tokens in the pickled binary data.

``fsspec`` instances, generally speaking, abide by these rules: they do not include locks, open files or other
thread-local material, and, where possible, use local credentials (such as a token file)
for re-establishing sessions upon de-serialisation (while making use of cached instances, where
they exist; see below).
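
As a minimal sketch of this round trip, using the bundled in-memory implementation (the same
idea applies to remote backends, subject to the credential caveats above):

.. code-block:: python

    import pickle

    import fsspec

    fs = fsspec.filesystem("memory")
    payload = pickle.dumps(fs)   # could be shipped to another process/machine
    fs2 = pickle.loads(payload)  # a usable file-system instance again
    assert type(fs2) is type(fs)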

``OpenFile`` instances
----------------------

The :class:`fsspec.core.OpenFile` class provides a convenient way to prescribe the manner in which to
open some file (local, remote, in a compressed store, etc.) which is portable, and can also apply any
compression and text-mode to the file. These instances are also serialisable, because they do not
contain any open files.

The way to work with ``OpenFile`` instances is to isolate interaction with them in a ``with`` context. It is
the initiation of the context which actually does the work of creating file-like instances.

.. code-block:: python

    of = fsspec.open(url, ...)
    # of is just a place-holder
    with of as f:
        # f is now a real file-like object holding resources
        f.read(...)

Random Access and Buffering
---------------------------

The :class:`fsspec.spec.AbstractBufferedFile` class is provided as an easy way to build file-like
interfaces to some service which is capable of providing blocks of bytes. This class is derived
from in a number of the existing implementations. A subclass of ``AbstractBufferedFile`` provides
random access for the underlying file-like data (without downloading the whole thing) and
configurable read-ahead buffers to minimise the number of read operations that need to be
performed on the back-end storage.

This is also a critical feature in the big-data access model, where each sub-task of an operation
may need only a small part of a file, and should not, therefore, be forced into downloading the
whole thing.
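
As a sketch (the URL is a placeholder, and the server must report the file size and accept
range requests for this to work):

.. code-block:: python

    import fsspec

    with fsspec.open("https://example.com/large.bin", mode="rb") as f:
        f.seek(2 ** 20)       # jump a megabyte into the file
        chunk = f.read(1024)  # fetches only the byte range(s) needed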

Transparent text-mode and compression
-------------------------------------

As mentioned above, the ``OpenFile`` class allows for the opening of files on a binary store,
which appear to be in text mode and/or allow for a compression/decompression layer between the
caller and the back-end storage system. From the user's point of view, this is achieved simply
by passing arguments to the :func:`fsspec.open_files` or :func:`fsspec.open` functions, and
thereafter happens transparently.
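
For example (the path is a placeholder; ``compression="infer"`` picks the gzip codec from the
``.gz`` suffix):

.. code-block:: python

    import fsspec

    with fsspec.open("data/table.csv.gz", mode="rt", compression="infer") as f:
        header = f.readline()  # decompressed and decoded on the fly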

Key-value stores
----------------

File-systems are naturally dict-like key-value mappings: each (string) path corresponds to some
binary data on the storage back-end. For some use-cases, it is very convenient to be able to
view some path within the file-system as a dict-like store, and the function :func:`fsspec.get_mapper`
gives a one-stop way to return such an object. This has become useful, for example, in the
context of the `zarr`_ project, which stores its array chunks in keys in any arbitrary mapping-like
object.

.. code-block:: python

    mapper = fsspec.get_mapper('protocol://server/path', args)
    list(mapper)
    mapper[k] = b'some data'

.. _zarr: https://zarr.readthedocs.io/en/stable/

PyArrow integration
-------------------

`pyarrow`_ has its own internal idea of what a file-system is (``pyarrow.filesystem.FileSystem``),
and some functions, particularly the loading of parquet, require that the target be compatible.
As it happens, the design of the file-system interface in ``pyarrow`` *is* compatible with `fsspec`
(this is not by accident). Therefore, at import time, ``fsspec`` checks for the existence of
``pyarrow``, and, if found, adds it to the superclasses of the spec base-class. In this manner,
all ``fsspec``-derived file-systems are also pyarrow file-systems, and can be used by pyarrow
functions.

.. _pyarrow: https://arrow.apache.org/docs/python/
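
A sketch of the relationship this creates, assuming ``pyarrow`` is installed before ``fsspec``
is first imported:

.. code-block:: python

    import pyarrow.filesystem
    import fsspec

    fs = fsspec.filesystem("memory")
    assert isinstance(fs, pyarrow.filesystem.FileSystem)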

Transactions
------------

``fsspec`` supports *transactions*, during which writing to files on a remote store is deferred
(typically put into a temporary location) until the transaction is over, whereupon the whole
transaction is finalised in a semi-atomic way, and all the files are moved/committed to their
final destination. The implementation details are file-system specific (and not all
implementations support it yet), but the idea is
that all files should get written or none, to mitigate against data corruption. The feature
can be used like

.. code-block:: python

    fs = fsspec.filesystem(...)
    with fs.transaction:
        with fs.open('file1', 'wb') as f:
            f.write(b'some data')
        with fs.open('file2', 'wb') as f:
            f.write(b'more data')

Here, files 1 and 2 do not get moved to the target location until the transaction context finishes.
If the context finishes due to an (uncaught) exception, then the files are discarded and the
file target locations are left untouched.

The class :class:`fsspec.spec.Transaction` allows for fine-tuning of the operation, and every
``fsspec`` instance has an instance of this as the attribute ``.transaction``, to give access.

Note that synchronising transactions across multiple instances, perhaps across a cluster,
is a harder problem to solve, and the implementation described here is only part of the solution.

Mount anything with FUSE
------------------------

Any path of any file-system can be mapped to a local directory using pyfuse and
:func:`fsspec.fuse.run`. This feature is experimental, but basic file listing with
details, and read/write should generally be available to the extent that the
remote file-system provides enough information. Naturally, if a file-system is read-only,
then write operations will fail - but they will tend to fail late and with obscure
error messages such as "bad address".

Some specific quirks of some file-systems may cause confusion for FUSE. For example,
it is possible for a given path on s3 to be both a valid key (i.e., containing binary
data, like a file) and a valid prefix (i.e., it can be listed to find subkeys, like a
directory). Since this breaks the assumptions of a normal file-system, it may not
be possible to reach all paths on the remote.
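
A sketch of a mount, assuming the 0.6.0 signature ``run(fs, path, mount_point)``, a working
FUSE installation, and a placeholder mount point (the call blocks while the mount is live):

.. code-block:: python

    import fsspec
    from fsspec.fuse import run

    fs = fsspec.filesystem("memory")
    run(fs, "/", "/mnt/mymem")  # serve the root of fs at /mnt/mymem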

Instance Caching
----------------

If a file-system implementation class is marked as *cachable* (attribute ``.cachable``),
then its instances will
get stored in a class attribute, to enable quick look-up instead of needing to regenerate
potentially expensive connections and sessions. The key in the cache is a tokenisation of
the arguments used to create the instance. The cache itself (attribute ``._cache``)
is currently a simple dict, but could in the future be an LRU, or something more complicated,
to fine-tune instance lifetimes.

Since files can hold on to write caches and read buffers,
the instance cache may cause excessive memory usage in some situations; but normally, files
will get ``close``d, and the data discarded. Only when there is also an unfinalised transaction or
a captured traceback might this be anticipated to become a problem.
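
The effect is easy to observe with the bundled in-memory implementation:

.. code-block:: python

    import fsspec

    fs1 = fsspec.filesystem("memory")
    fs2 = fsspec.filesystem("memory")
    assert fs1 is fs2  # identical arguments return the cached instance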

File Buffering
--------------

Most implementations create file objects which derive from ``fsspec.spec.AbstractBufferedFile``, and
have many behaviours in common. These files offer buffering of both read and write operations, so that
communication with the remote resource is limited. The size of the buffer is generally configured
with the ``blocksize=`` kwarg at open time, although the implementation may have some minimum or
maximum sizes that need to be respected.

For reading, a number of buffering schemes are available, listed in ``fsspec.caching.caches``
(see :ref:`readbuffering`), or "none" for no buffering at all. E.g., for a simple read-ahead
buffer, you can do

.. code-block:: python

    fs = fsspec.filesystem(...)
    with fs.open(path, mode='rb', cache_type='readahead') as f:
        use_for_something(f)

Caching Files Locally
---------------------

``fsspec`` allows you to access data on remote file systems - that is its purpose. However, such
access can often be rather slow compared to local storage, so as well as buffering (see above), the
option exists to copy files locally when you first access them, and thereafter to use the local data.
This local cache of data might be temporary (i.e., attached to the process and discarded when the
process ends) or at some specific location in your local storage.

Two mechanisms are provided, and both involve wrapping a `target` filesystem. The following example
creates a file-based cache.

.. code-block:: python

    fs = fsspec.filesystem("filecache", target_protocol='s3', target_options={'anon': True},
                           cache_storage='/tmp/files/')

Each time you open a remote file on S3, it will first be copied to
a local temporary directory, and thereafter all further access will use the local file. Since we specify
a particular local location, the files will persist and can be reused from future sessions, although
you can also set policies to have cached files expire after some time, or to check the remote file system
on each open, to see if the target file has changed since it was copied.

With the "blockcache" variant, data is downloaded block-wise: only the specific parts of the remote file
which are accessed. This means that the local copy of the file might end up being much smaller than the
remote one, if only certain parts of it are required.
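
A sketch of the block-wise variant, mirroring the "filecache" example above:

.. code-block:: python

    fs = fsspec.filesystem("blockcache", target_protocol='s3', target_options={'anon': True},
                           cache_storage='/tmp/blocks/')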

Whereas "filecache" works for all file system implementations, and provides a real local file for other
libraries to use, "blockcache" has restrictions: that you have a storage/OS combination which supports
sparse files, that the backend implementation uses files which derive from ``AbstractBufferedFile``,
and that the library you pass the resultant object to accepts generic python file-like objects. You
should not mix block- and file-caches in the same directory.

--- docs/source/index.rst ---

fsspec: python filesystem interfaces
====================================

Filesystem Spec is a project to unify various projects and classes to work with remote filesystems and
file-system-like abstractions using a standard pythonic interface.


.. _highlight:

Highlights
----------

- based on s3fs and gcsfs
- ``fsspec`` instances are serializable and can be passed between processes/machines
- the ``OpenFile`` file-like instances are also serializable
- implementations provide random access, so that only the part of a file required need be read; plus a template
  to base other file-like classes on
- file access can use transparent compression and text-mode
- any file-system directory can be viewed as a key-value/mapping store
- if installed, all file-system classes also subclass from ``pyarrow.filesystem.FileSystem``, so
  can work with any arrow function expecting such an instance
- writes can be transactional: stored in a temporary location and only moved to the final
  destination when the transaction is committed
- FUSE: mount any path from any backend to a point on your file-system
- cached instances tokenised on the instance parameters

These are described further in the :doc:`features` section.

Installation
------------

   pip install fsspec

or

   conda install -c conda-forge fsspec

Implementations
---------------

This repo contains several file-system implementations, see :ref:`implementations`. However,
the external projects ``s3fs`` and ``gcsfs`` depend on ``fsspec`` and share the same behaviours.
``Dask`` and ``Intake`` use ``fsspec`` internally for their IO needs.

The current list of known implementations can be found as follows

.. code-block:: python

    from fsspec.registry import known_implementations
    known_implementations

These are only imported on request, which may fail if a required dependency is missing. The dictionary
``fsspec.registry`` contains all imported implementations, and can be mutated by user code, if necessary.


.. toctree::
   :maxdepth: 2
   :caption: Contents:

   intro.rst
   usage.rst
   features.rst
   api.rst
   changelog.rst


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

--- docs/source/intro.rst ---

Introduction
============

To get stuck into using the package, rather than reading about its philosophy and history, you can
skip to :doc:`usage`.

Background
----------

Python provides a standard interface for open files, so that alternate implementations of file-like objects can
work seamlessly with the many functions which rely only on the methods of that standard interface. A number of libraries
have implemented a similar concept for file-systems, where file operations can be performed on a logical file-system
which may be local, a structured data store, or some remote service.

This repository is intended to be a place to define a standard interface that such file-systems should adhere to,
such that code using them should not have to know the details of the implementation in order to operate on any of
a number of backends. The hope is that the community can come together to
define an interface that is best for the highest number of users, and that having the specification makes developing
other file-system implementations simpler.

History
-------

I (Martin Durant) have been involved in building a number of remote-data file-system implementations, principally
in the context of the `Dask`_ project. In particular, several are listed
in `docs`_ with links to the specific repositories.
With common authorship, there is much that is similar between the implementations, for example posix-like naming
of the operations, and this has allowed Dask to be able to interact with the various backends and parse generic
URLs in order to select amongst them. However, *some* extra code was required in each case to adapt the peculiarities
of each implementation to the generic usage that Dask demanded. People may find the
`code`_ which parses URLs and creates file-system
instances interesting.

.. _Dask: http://dask.pydata.org/en/latest/
.. _docs: http://dask.pydata.org/en/latest/remote-data-services.html
.. _code: https://github.com/dask/dask/blob/master/dask/bytes/core.py#L266

At the same time, the Apache `Arrow`_ project was also concerned with a similar problem,
particularly a common interface to local and HDFS files, for example the
`hdfs`_ interface (which actually communicated with HDFS
with a choice of driver). These are mostly used internally within Arrow, but Dask was modified in order to be able
to use the alternate HDFS interface (which solves some security issues with `hdfs3`). In the process, a
`conversation`_
was started, and I invite all interested parties to continue the conversation in this location.

.. _Arrow: https://arrow.apache.org/
.. _hdfs: https://arrow.apache.org/docs/python/filesystems.html
.. _conversation: https://github.com/dask/dask/issues/2880

There is a good argument that this type of code has no place in Dask, which is concerned with making graphs
representing computations, and executing those graphs on a scheduler. Indeed, the file-systems are generally useful,
and each has a user-base wider than just those that work via Dask.

Influences
----------

The following are places to consider when choosing the definitions of how we would like the file-system specification
to look:

- python's `os`_ module and its `path` namespace; also other file-connected
  functionality in the standard library
- posix/bash method naming conventions that linux/unix/osx users are familiar with; or perhaps their Windows variants
- the existing implementations for the various backends (e.g.,
  `gcsfs`_ or Arrow's
  `hdfs`_)
- `pyfilesystems`_, an attempt to do something similar, with a
  plugin architecture. This project has several types of local file-system, and a lot of well-thought-out
  validation code.

.. _os: https://docs.python.org/3/library/os.html
.. _gcsfs: http://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem
.. _pyfilesystems: https://docs.pyfilesystem.org/en/latest/index.html

Not pyfilesystems?
------------------

It might have been conceivable to reuse code in ``pyfilesystems``, which has an established interface and several
implementations of its own. However, it supports none of the :ref:`highlight`, which are critical to
cloud and parallel access, and it would not be easy to
coerce. Following on the success of ``s3fs`` and ``gcsfs``, and their use within Dask, it seemed best to
have an interface as close to those as possible. See a
`discussion`_ on the topic.

.. _discussion: https://github.com/intake/filesystem_spec/issues/5

Structure of the package
------------------------

The best place to get a feel for the contents of ``fsspec`` is by looking through the :doc:`usage` and
:doc:`api` sections. In addition, the source code will be interesting for those who wish to subclass and
develop new file-system implementations. ``fsspec/spec.py`` contains the main abstract file-system class
to derive from, ``AbstractFileSystem``.

.. _zarr: https://zarr.readthedocs.io

--- docs/source/usage.rst ---

Usage
=====

This is quick-start documentation to help people get familiar with the layout and functioning of ``fsspec``.

Instantiate a file-system
-------------------------

``fsspec`` provides an abstract file-system interface as a template for other filesystems. In this context,
"interface" means an API for working with files on the given file-system, which can mean files on some
remote store, local files, files within some wrapper, or anything else that is capable of producing
file-like objects.

Some concrete implementations are bundled with ``fsspec`` and others can be installed separately. They
can be instantiated directly, or the `registry` can be used to find them.

Direct instantiation:

.. code-block:: python

    from fsspec.implementations.local import LocalFileSystem
    fs = LocalFileSystem()

Look-up via registry:

.. code-block:: python

    import fsspec
    fs = fsspec.filesystem('file')

Many filesystems also take extra parameters, some of which may be optional - see :doc:`api`.

.. code-block:: python

    import fsspec
    fs = fsspec.filesystem('ftp', host=host, port=port,
                           username=user, password=pw)

Use a file-system
-----------------

File-system instances offer a large number of methods for getting information about and manipulating files
for the given back-end. Although some specific implementations may not offer all features (e.g., ``http``
is read-only), generally all normal operations, such as ``ls`` and ``rm``, should be expected to work (see the
full list: :class:`fsspec.spec.AbstractFileSystem`).
Note that this quick-start will prefer posix-style naming, but
many common operations are aliased: ``cp()`` and ``copy()`` are identical, for instance.
Functionality is generally chosen to be as close to the builtin ``os`` module's working for things like
``glob`` as possible, as shown below.
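
For example, a sketch with placeholder paths (any backend with write support behaves similarly):

.. code-block:: python

    fs.mkdir("/remote/output")                 # create a directory
    fs.copy("/remote/input.csv", "/remote/output/input.csv")
    fs.ls("/remote/output")                    # list paths under the directory
    fs.cat("/remote/output/input.csv")         # fetch the contents as bytes
    fs.rm("/remote/output/input.csv")          # delete again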

The ``open()`` method will return a file-like object which can be passed to any other library that expects
to work with python files. These will normally be binary-mode only, but may implement internal buffering
in order to limit the number of reads from a remote source. They respect the use of ``with`` contexts. If
you have ``pandas`` installed, for example, you can do the following:

.. code-block:: python

    with fs.open('https://raw.githubusercontent.com/dask/'
                 'fastparquet/master/test-data/nation.csv') as f:
        df = pd.read_csv(f, sep='|', header=None)

Higher-level
------------

For many situations, the only function that will be needed is :func:`fsspec.open_files`, which will return
:class:`fsspec.core.OpenFile` instances created from a single URL and parameters to pass to the backend.
This supports text-mode and compression on the fly, and the objects can be serialized for passing between
processes or machines (so long as each has access to the same backend file-system). The protocol (i.e.,
backend) is inferred from the URL passed, and glob characters are expanded in read mode (searching for files)
or write mode (creating names). Critically, the file on the backend system is not actually opened until the
``OpenFile`` instance is used in a ``with`` context. For the example above:

.. code-block:: python

    of = fsspec.open('https://raw.githubusercontent.com/dask/'
                     'fastparquet/master/test-data/nation.csv', mode='r')
    # of is a not-yet-open OpenFile object. The "with" context actually opens it
    with of as f:
        # now f is a text-mode file
        df = pd.read_csv(f, sep='|', header=None)

--- fsspec/__init__.py ---

from ._version import get_versions

from .spec import AbstractFileSystem
from .registry import get_filesystem_class, registry, filesystem
from .mapping import FSMap, get_mapper
from .core import open_files, get_fs_token_paths, open
from . import caching

__version__ = get_versions()["version"]
del get_versions


__all__ = [
    "AbstractFileSystem",
    "FSMap",
    "filesystem",
    "get_filesystem_class",
    "get_fs_token_paths",
    "get_mapper",
    "open",
    "open_files",
    "registry",
    "caching",
]

--- fsspec/_version.py ---

# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by githubs download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
# directories (produced by setup.py build) will contain a much shorter file
# that just contains the computed version number.

# This file is released into the public domain. Generated by
# versioneer-0.18 (https://github.com/warner/python-versioneer)

"""Git implementation of _version.py."""

import errno
import os
import re
import subprocess
import sys


def get_keywords():
    """Get the keywords needed to look up the version information."""
    # these strings will be replaced by git during git-archive.
    # setup.py/versioneer.py will grep for the variable names, so they must
    # each be defined on a line of their own. _version.py will just call
    # get_keywords().
    git_refnames = " (tag: 0.6.0)"
    git_full = "8b59dc8c2c035db5793102b9513c46e6a1bd4fb0"
    git_date = "2019-11-13 10:37:40 -0600"
    keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
    return keywords


class VersioneerConfig:
    """Container for Versioneer configuration parameters."""


def get_config():
    """Create, populate and return the VersioneerConfig() object."""
    # these strings are filled in when 'setup.py versioneer' creates
    # _version.py
    cfg = VersioneerConfig()
    cfg.VCS = "git"
    cfg.style = "pep440"
    cfg.tag_prefix = ""
    cfg.parentdir_prefix = "None"
    cfg.versionfile_source = "fsspec/_version.py"
    cfg.verbose = False
    return cfg


class NotThisMethod(Exception):
    """Exception raised if a method is not valid for the current scenario."""


LONG_VERSION_PY = {}
HANDLERS = {}


def register_vcs_handler(vcs, method):  # decorator
    """Decorator to mark a method as the handler for a particular VCS."""

    def decorate(f):
        """Store f in HANDLERS[vcs][method]."""
        if vcs not in HANDLERS:
            HANDLERS[vcs] = {}
        HANDLERS[vcs][method] = f
        return f

    return decorate


def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
    """Call the given command(s)."""
    assert isinstance(commands, list)
    p = None
    for c in commands:
        try:
            dispcmd = str([c] + args)
            # remember shell=False, so use git.cmd on windows, not just git
            p = subprocess.Popen(
                [c] + args,
                cwd=cwd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=(subprocess.PIPE if hide_stderr else None),
            )
            break
        except EnvironmentError:
            e = sys.exc_info()[1]
            if e.errno == errno.ENOENT:
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(e)
            return None, None
    else:
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None
    stdout = p.communicate()[0].strip()
    if sys.version_info[0] >= 3:
        stdout = stdout.decode()
    if p.returncode != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % stdout)
        return None, p.returncode
    return stdout, p.returncode


def versions_from_parentdir(parentdir_prefix, root, verbose):
    """Try to determine the version from the parent directory name.

    Source tarballs conventionally unpack into a directory that includes both
    the project name and a version string. We will also support searching up
    two directory levels for an appropriately named parent directory
    """
    rootdirs = []

    for i in range(3):
        dirname = os.path.basename(root)
        if dirname.startswith(parentdir_prefix):
            return {
                "version": dirname[len(parentdir_prefix) :],
                "full-revisionid": None,
                "dirty": False,
                "error": None,
                "date": None,
            }
        else:
            rootdirs.append(root)
            root = os.path.dirname(root)  # up a level

    if verbose:
        print(
            "Tried directories %s but none started with prefix %s"
            % (str(rootdirs), parentdir_prefix)
        )
    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")


@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
    """Extract version information from the given file."""
    # the code embedded in _version.py can just fetch the value of these
    # keywords. When used from setup.py, we don't want to import _version.py,
    # so we do it with a regexp instead. This function is not used from
    # _version.py.
    keywords = {}
    try:
        f = open(versionfile_abs, "r")
        for line in f.readlines():
            if line.strip().startswith("git_refnames ="):
                mo = re.search(r'=\s*"(.*)"', line)
                if mo:
                    keywords["refnames"] = mo.group(1)
            if line.strip().startswith("git_full ="):
                mo = re.search(r'=\s*"(.*)"', line)
                if mo:
                    keywords["full"] = mo.group(1)
            if line.strip().startswith("git_date ="):
                mo = re.search(r'=\s*"(.*)"', line)
                if mo:
                    keywords["date"] = mo.group(1)
        f.close()
    except EnvironmentError:
        pass
    return keywords


@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(keywords, tag_prefix, verbose):
    """Get version information from git keywords."""
    if not keywords:
        raise NotThisMethod("no keywords at all, weird")
    date = keywords.get("date")
    if date is not None:
        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
        # -like" string, which we must then edit to make compliant), because
        # it's been around since git-1.5.3, and it's too difficult to
        # discover which version we're using, or to work around using an
        # older one.
        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    refs = set([r.strip() for r in refnames.strip("()").split(",")])
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    TAG = "tag: "
    tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = set([r for r in refs if re.search(r"\d", r)])
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
    if verbose:
        print("likely tags: %s" % ",".join(sorted(tags)))
    for ref in sorted(tags):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if ref.startswith(tag_prefix):
            r = ref[len(tag_prefix) :]
            if verbose:
                print("picking %s" % r)
            return {
                "version": r,
                "full-revisionid": keywords["full"].strip(),
                "dirty": False,
                "error": None,
                "date": date,
            }
    # no suitable tags, so version is "0+unknown", but full hex is still there
    if verbose:
        print("no suitable tags, using unknown + full revision id")
    return {
        "version": "0+unknown",
        "full-revisionid": keywords["full"].strip(),
        "dirty": False,
        "error": "no suitable tags",
        "date": None,
    }


@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
    """Get version from 'git describe' in the root of the source tree.

    This only gets called if the git-archive 'subst' keywords were *not*
    expanded, and _version.py hasn't already been rewritten with a short
    version string, meaning we're inside a checked out source tree.
238 | """ | |
239 | GITS = ["git"] | |
240 | if sys.platform == "win32": | |
241 | GITS = ["git.cmd", "git.exe"] | |
242 | ||
243 | out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) | |
244 | if rc != 0: | |
245 | if verbose: | |
246 | print("Directory %s not under git control" % root) | |
247 | raise NotThisMethod("'git rev-parse --git-dir' returned error") | |
248 | ||
249 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] | |
250 | # if there isn't one, this yields HEX[-dirty] (no NUM) | |
251 | describe_out, rc = run_command( | |
252 | GITS, | |
253 | [ | |
254 | "describe", | |
255 | "--tags", | |
256 | "--dirty", | |
257 | "--always", | |
258 | "--long", | |
259 | "--match", | |
260 | "%s*" % tag_prefix, | |
261 | ], | |
262 | cwd=root, | |
263 | ) | |
264 | # --long was added in git-1.5.5 | |
265 | if describe_out is None: | |
266 | raise NotThisMethod("'git describe' failed") | |
267 | describe_out = describe_out.strip() | |
268 | full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) | |
269 | if full_out is None: | |
270 | raise NotThisMethod("'git rev-parse' failed") | |
271 | full_out = full_out.strip() | |
272 | ||
273 | pieces = {} | |
274 | pieces["long"] = full_out | |
275 | pieces["short"] = full_out[:7] # maybe improved later | |
276 | pieces["error"] = None | |
277 | ||
278 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] | |
279 | # TAG might have hyphens. | |
280 | git_describe = describe_out | |
281 | ||
282 | # look for -dirty suffix | |
283 | dirty = git_describe.endswith("-dirty") | |
284 | pieces["dirty"] = dirty | |
285 | if dirty: | |
286 | git_describe = git_describe[: git_describe.rindex("-dirty")] | |
287 | ||
288 | # now we have TAG-NUM-gHEX or HEX | |
289 | ||
290 | if "-" in git_describe: | |
291 | # TAG-NUM-gHEX | |
292 | mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) | |
293 | if not mo: | |
294 | # unparseable. Maybe git-describe is misbehaving? | |
295 | pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out | |
296 | return pieces | |
297 | ||
298 | # tag | |
299 | full_tag = mo.group(1) | |
300 | if not full_tag.startswith(tag_prefix): | |
301 | if verbose: | |
302 | fmt = "tag '%s' doesn't start with prefix '%s'" | |
303 | print(fmt % (full_tag, tag_prefix)) | |
304 | pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( | |
305 | full_tag, | |
306 | tag_prefix, | |
307 | ) | |
308 | return pieces | |
309 | pieces["closest-tag"] = full_tag[len(tag_prefix) :] | |
310 | ||
311 | # distance: number of commits since tag | |
312 | pieces["distance"] = int(mo.group(2)) | |
313 | ||
314 | # commit: short hex revision ID | |
315 | pieces["short"] = mo.group(3) | |
316 | ||
317 | else: | |
318 | # HEX: no tags | |
319 | pieces["closest-tag"] = None | |
320 | count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) | |
321 | pieces["distance"] = int(count_out) # total number of commits | |
322 | ||
323 | # commit date: see ISO-8601 comment in git_versions_from_keywords() | |
324 | date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ | |
325 | 0 | |
326 | ].strip() | |
327 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) | |
328 | ||
329 | return pieces | |
330 | ||
331 | ||
332 | def plus_or_dot(pieces): | |
333 | """Return a + if we don't already have one, else return a .""" | |
334 | if "+" in pieces.get("closest-tag", ""): | |
335 | return "." | |
336 | return "+" | |
337 | ||
338 | ||
339 | def render_pep440(pieces): | |
340 | """Build up version string, with post-release "local version identifier". | |
341 | ||
342 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you | |
343 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty | |
344 | ||
345 | Exceptions: | |
346 | 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] | |
347 | """ | |
348 | if pieces["closest-tag"]: | |
349 | rendered = pieces["closest-tag"] | |
350 | if pieces["distance"] or pieces["dirty"]: | |
351 | rendered += plus_or_dot(pieces) | |
352 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) | |
353 | if pieces["dirty"]: | |
354 | rendered += ".dirty" | |
355 | else: | |
356 | # exception #1 | |
357 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) | |
358 | if pieces["dirty"]: | |
359 | rendered += ".dirty" | |
360 | return rendered | |
361 | ||
362 | ||
363 | def render_pep440_pre(pieces): | |
364 | """TAG[.post.devDISTANCE] -- No -dirty. | |
365 | ||
366 | Exceptions: | |
367 | 1: no tags. 0.post.devDISTANCE | |
368 | """ | |
369 | if pieces["closest-tag"]: | |
370 | rendered = pieces["closest-tag"] | |
371 | if pieces["distance"]: | |
372 | rendered += ".post.dev%d" % pieces["distance"] | |
373 | else: | |
374 | # exception #1 | |
375 | rendered = "0.post.dev%d" % pieces["distance"] | |
376 | return rendered | |
377 | ||
378 | ||
379 | def render_pep440_post(pieces): | |
380 | """TAG[.postDISTANCE[.dev0]+gHEX] . | |
381 | ||
382 | The ".dev0" means dirty. Note that .dev0 sorts backwards | |
383 | (a dirty tree will appear "older" than the corresponding clean one), | |
384 | but you shouldn't be releasing software with -dirty anyways. | |
385 | ||
386 | Exceptions: | |
387 | 1: no tags. 0.postDISTANCE[.dev0] | |
388 | """ | |
389 | if pieces["closest-tag"]: | |
390 | rendered = pieces["closest-tag"] | |
391 | if pieces["distance"] or pieces["dirty"]: | |
392 | rendered += ".post%d" % pieces["distance"] | |
393 | if pieces["dirty"]: | |
394 | rendered += ".dev0" | |
395 | rendered += plus_or_dot(pieces) | |
396 | rendered += "g%s" % pieces["short"] | |
397 | else: | |
398 | # exception #1 | |
399 | rendered = "0.post%d" % pieces["distance"] | |
400 | if pieces["dirty"]: | |
401 | rendered += ".dev0" | |
402 | rendered += "+g%s" % pieces["short"] | |
403 | return rendered | |
404 | ||
405 | ||
406 | def render_pep440_old(pieces): | |
407 | """TAG[.postDISTANCE[.dev0]] . | |
408 | ||
409 | The ".dev0" means dirty. | |
410 | ||
411 | Exceptions: | |
412 | 1: no tags. 0.postDISTANCE[.dev0] | |
413 | """ | |
414 | if pieces["closest-tag"]: | |
415 | rendered = pieces["closest-tag"] | |
416 | if pieces["distance"] or pieces["dirty"]: | |
417 | rendered += ".post%d" % pieces["distance"] | |
418 | if pieces["dirty"]: | |
419 | rendered += ".dev0" | |
420 | else: | |
421 | # exception #1 | |
422 | rendered = "0.post%d" % pieces["distance"] | |
423 | if pieces["dirty"]: | |
424 | rendered += ".dev0" | |
425 | return rendered | |
426 | ||
427 | ||
428 | def render_git_describe(pieces): | |
429 | """TAG[-DISTANCE-gHEX][-dirty]. | |
430 | ||
431 | Like 'git describe --tags --dirty --always'. | |
432 | ||
433 | Exceptions: | |
434 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) | |
435 | """ | |
436 | if pieces["closest-tag"]: | |
437 | rendered = pieces["closest-tag"] | |
438 | if pieces["distance"]: | |
439 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) | |
440 | else: | |
441 | # exception #1 | |
442 | rendered = pieces["short"] | |
443 | if pieces["dirty"]: | |
444 | rendered += "-dirty" | |
445 | return rendered | |
446 | ||
447 | ||
448 | def render_git_describe_long(pieces): | |
449 | """TAG-DISTANCE-gHEX[-dirty]. | |
450 | ||
451 | Like 'git describe --tags --dirty --always --long'. | |
452 | The distance/hash is unconditional. | |
453 | ||
454 | Exceptions: | |
455 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) | |
456 | """ | |
457 | if pieces["closest-tag"]: | |
458 | rendered = pieces["closest-tag"] | |
459 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) | |
460 | else: | |
461 | # exception #1 | |
462 | rendered = pieces["short"] | |
463 | if pieces["dirty"]: | |
464 | rendered += "-dirty" | |
465 | return rendered | |
466 | ||
467 | ||
468 | def render(pieces, style): | |
469 | """Render the given version pieces into the requested style.""" | |
470 | if pieces["error"]: | |
471 | return { | |
472 | "version": "unknown", | |
473 | "full-revisionid": pieces.get("long"), | |
474 | "dirty": None, | |
475 | "error": pieces["error"], | |
476 | "date": None, | |
477 | } | |
478 | ||
479 | if not style or style == "default": | |
480 | style = "pep440" # the default | |
481 | ||
482 | if style == "pep440": | |
483 | rendered = render_pep440(pieces) | |
484 | elif style == "pep440-pre": | |
485 | rendered = render_pep440_pre(pieces) | |
486 | elif style == "pep440-post": | |
487 | rendered = render_pep440_post(pieces) | |
488 | elif style == "pep440-old": | |
489 | rendered = render_pep440_old(pieces) | |
490 | elif style == "git-describe": | |
491 | rendered = render_git_describe(pieces) | |
492 | elif style == "git-describe-long": | |
493 | rendered = render_git_describe_long(pieces) | |
494 | else: | |
495 | raise ValueError("unknown style '%s'" % style) | |
496 | ||
497 | return { | |
498 | "version": rendered, | |
499 | "full-revisionid": pieces["long"], | |
500 | "dirty": pieces["dirty"], | |
501 | "error": None, | |
502 | "date": pieces.get("date"), | |
503 | } | |
504 | ||
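505 | # A worked sketch of the dispatch above: the same pieces dict rendered in | |
506 | # two styles (values are illustrative): | |
507 | # | |
508 | # >>> pieces = {"closest-tag": "0.6.0", "distance": 3, "short": "abc1234", | |
509 | # ... "long": "abc1234", "dirty": False, "error": None, "date": None} | |
510 | # >>> render(pieces, "pep440")["version"] | |
511 | # '0.6.0+3.gabc1234' | |
512 | # >>> render(pieces, "git-describe")["version"] | |
513 | # '0.6.0-3-gabc1234' | |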
505 | ||
506 | def get_versions(): | |
507 | """Get version information or return default if unable to do so.""" | |
508 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have | |
509 | # __file__, we can work backwards from there to the root. Some | |
510 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which | |
511 | # case we can only use expanded keywords. | |
512 | ||
513 | cfg = get_config() | |
514 | verbose = cfg.verbose | |
515 | ||
516 | try: | |
517 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) | |
518 | except NotThisMethod: | |
519 | pass | |
520 | ||
521 | try: | |
522 | root = os.path.realpath(__file__) | |
523 | # versionfile_source is the relative path from the top of the source | |
524 | # tree (where the .git directory might live) to this file. Invert | |
525 | # this to find the root from __file__. | |
526 | for i in cfg.versionfile_source.split("/"): | |
527 | root = os.path.dirname(root) | |
528 | except NameError: | |
529 | return { | |
530 | "version": "0+unknown", | |
531 | "full-revisionid": None, | |
532 | "dirty": None, | |
533 | "error": "unable to find root of source tree", | |
534 | "date": None, | |
535 | } | |
536 | ||
537 | try: | |
538 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) | |
539 | return render(pieces, cfg.style) | |
540 | except NotThisMethod: | |
541 | pass | |
542 | ||
543 | try: | |
544 | if cfg.parentdir_prefix: | |
545 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) | |
546 | except NotThisMethod: | |
547 | pass | |
548 | ||
549 | return { | |
550 | "version": "0+unknown", | |
551 | "full-revisionid": None, | |
552 | "dirty": None, | |
553 | "error": "unable to compute version", | |
554 | "date": None, | |
555 | } |
0 | import os | |
1 | import io | |
2 | import functools | |
3 | import logging | |
4 | import math | |
5 | ||
6 | logger = logging.getLogger("fsspec") | |
7 | ||
8 | ||
9 | class BaseCache(object): | |
10 | """Pass-though cache: doesn't keep anything, calls every time | |
11 | ||
12 | Acts as base class for other cachers | |
13 | ||
14 | Parameters | |
15 | ---------- | |
16 | blocksize: int | |
17 | How far to read ahead, in bytes | |
18 | fetcher: func | |
19 | Function of the form f(start, end) which gets bytes from remote as | |
20 | specified | |
21 | size: int | |
22 | How big this file is | |
23 | """ | |
24 | ||
25 | def __init__(self, blocksize, fetcher, size): | |
26 | self.blocksize = blocksize | |
27 | self.fetcher = fetcher | |
28 | self.size = size | |
29 | ||
30 | def _fetch(self, start, end): | |
31 | return self.fetcher(start, end) | |
32 | ||
33 | def __getitem__(self, item: slice): | |
34 | if not isinstance(item, slice): | |
35 | raise TypeError( | |
36 | "Cache indices must be a contiguous slice. Got {} instead.".format( | |
37 | type(item) | |
38 | ) | |
39 | ) | |
40 | if item.step and item.step != 1: | |
41 | raise ValueError( | |
42 | "Cache indices must be a contiguous slice. 'item' has step={}".format( | |
43 | item.step | |
44 | ) | |
45 | ) | |
46 | ||
47 | # handle endpoints | |
48 | if item.start is None: | |
49 | item = slice(0, item.stop) | |
50 | elif item.start < 0: | |
51 | item = slice(self.size + item.start, item.stop) | |
52 | if item.stop is None: | |
53 | item = slice(item.start, self.size) | |
54 | elif item.stop < 0: | |
55 | item = slice(item.start, self.size + item.stop) | |
56 | ||
57 | return self._fetch(item.start, item.stop) | |
58 | ||
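59 | # A minimal usage sketch (the lambda stands in for a real remote | |
60 | # byte-range fetcher): | |
61 | # | |
62 | # >>> data = b"0123456789" | |
63 | # >>> c = BaseCache(4, lambda start, end: data[start:end], len(data)) | |
64 | # >>> c[2:6] # no caching: every access calls the fetcher | |
65 | # b'2345' | |
66 | # >>> c[-3:] # negative indices are resolved against size | |
67 | # b'789' | |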
59 | ||
60 | class MMapCache(BaseCache): | |
61 | """memory-mapped sparse file cache | |
62 | ||
63 | Opens temporary file, which is filled blocks-wise when data is requested. | |
64 | Ensure there is enough disc space in the temporary location. | |
65 | ||
66 | This cache method might only work on posix | |
67 | """ | |
68 | ||
69 | def __init__(self, blocksize, fetcher, size, location=None, blocks=None): | |
70 | super().__init__(blocksize, fetcher, size) | |
71 | self.blocks = set() if blocks is None else blocks | |
72 | self.location = location | |
73 | self.cache = self._makefile() | |
74 | ||
75 | def _makefile(self): | |
76 | import tempfile | |
77 | import mmap | |
78 | ||
79 | if self.size == 0: | |
80 | return bytearray() | |
81 | ||
82 | # posix version | |
83 | if self.location is None or not os.path.exists(self.location): | |
84 | if self.location is None: | |
85 | fd = tempfile.TemporaryFile() | |
86 | self.blocks = set() | |
87 | else: | |
88 | fd = io.open(self.location, "wb+") | |
89 | fd.seek(self.size - 1) | |
90 | fd.write(b"1") | |
91 | fd.flush() | |
92 | else: | |
93 | fd = io.open(self.location, "rb+") | |
94 | ||
95 | return mmap.mmap(fd.fileno(), self.size) | |
96 | ||
97 | def _fetch(self, start, end): | |
98 | start_block = start // self.blocksize | |
99 | end_block = end // self.blocksize | |
100 | need = [i for i in range(start_block, end_block + 1) if i not in self.blocks] | |
101 | while need: | |
102 | # TODO: not a for loop so we can consolidate blocks later to | |
103 | # make fewer fetch calls; this could be parallel | |
104 | i = need.pop(0) | |
105 | sstart = i * self.blocksize | |
106 | send = min(sstart + self.blocksize, self.size) | |
107 | self.cache[sstart:send] = self.fetcher(sstart, send) | |
108 | self.blocks.add(i) | |
109 | ||
110 | return self.cache[start:end] | |
111 | ||
112 | def __getstate__(self): | |
113 | state = self.__dict__.copy() | |
114 | # Remove the unpicklable entries. | |
115 | del state["cache"] | |
116 | return state | |
117 | ||
118 | def __setstate__(self, state): | |
119 | # Restore instance attributes | |
120 | self.__dict__.update(state) | |
121 | self.cache = self._makefile() | |
122 | ||
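123 | # A usage sketch (POSIX only, since this relies on mmap over a sparse | |
124 | # temporary file; the lambda stands in for a remote fetcher): | |
125 | # | |
126 | # >>> data = b"x" * 10000 | |
127 | # >>> mc = MMapCache(4096, lambda s, e: data[s:e], len(data)) | |
128 | # >>> mc[100:200] == data[100:200] # fetches and records block 0 only | |
129 | # True | |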
123 | ||
124 | class ReadAheadCache(BaseCache): | |
125 | """ Cache which reads only when we get beyond a block of data | |
126 | ||
127 | This is a much simpler version of BytesCache, and does not attempt to | |
128 | fill holes in the cache or keep fragments alive. It is best suited to | |
129 | many small reads in a sequential order (e.g., reading lines from a file). | |
130 | """ | |
131 | ||
132 | def __init__(self, blocksize, fetcher, size): | |
133 | super().__init__(blocksize, fetcher, size) | |
134 | self.cache = b"" | |
135 | self.start = 0 | |
136 | self.end = 0 | |
137 | ||
138 | def _fetch(self, start, end): | |
139 | end = min(self.size, end) | |
140 | l = end - start | |
141 | if start >= self.size: | |
142 | return b"" | |
143 | elif start >= self.start and end <= self.end: | |
144 | # cache hit | |
145 | return self.cache[start - self.start : end - self.start] | |
146 | elif self.start <= start < self.end: | |
147 | # partial hit | |
148 | part = self.cache[start - self.start :] | |
149 | l -= len(part) | |
150 | start = self.end | |
151 | else: | |
152 | # miss | |
153 | part = b"" | |
154 | end = min(self.size, end + self.blocksize) | |
155 | self.cache = self.fetcher(start, end) # new block replaces old | |
156 | self.start = start | |
157 | self.end = self.start + len(self.cache) | |
158 | return part + self.cache[:l] | |
159 | ||
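160 | # A behaviour sketch: sequential reads mostly hit the cache, because each | |
161 | # miss reads one extra blocksize ahead. | |
162 | # | |
163 | # >>> data = b"0123456789abcdef" | |
164 | # >>> c = ReadAheadCache(4, lambda s, e: data[s:e], len(data)) | |
165 | # >>> c[0:3] # miss: fetches bytes 0-7 (request plus blocksize) | |
166 | # b'012' | |
167 | # >>> c[3:6] # hit: served from the buffered block | |
168 | # b'345' | |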
160 | ||
161 | class BlockCache(BaseCache): | |
162 | """ | |
163 | Cache holding memory as a set of blocks. | |
164 | ||
165 | Requests are only ever made `blocksize` at a time, and are | |
166 | stored in an LRU cache. The least recently accessed block is | |
167 | discarded when more than `maxblocks` are stored. | |
168 | ||
169 | Parameters | |
170 | ---------- | |
171 | blocksize : int | |
172 | The number of bytes to store in each block. | |
173 | Requests are only ever made for `blocksize`, so this | |
174 | should balance the overhead of making a request against | |
175 | the granularity of the blocks. | |
176 | fetcher : Callable | |
177 | size : int | |
178 | The total size of the file being cached. | |
179 | maxblocks : int | |
180 | The maximum number of blocks to cache for. The maximum memory | |
181 | use for this cache is then ``blocksize * maxblocks``. | |
182 | """ | |
183 | ||
184 | def __init__(self, blocksize, fetcher, size, maxblocks=32): | |
185 | super().__init__(blocksize, fetcher, size) | |
186 | self.nblocks = math.ceil(size / blocksize) | |
187 | self.maxblocks = maxblocks | |
188 | self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block) | |
189 | ||
190 | def __repr__(self): | |
191 | return "<BlockCache blocksize={}, size={}, nblocks={}>".format( | |
192 | self.blocksize, self.size, self.nblocks | |
193 | ) | |
194 | ||
195 | def cache_info(self): | |
196 | """ | |
197 | The statistics on the block cache. | |
198 | ||
199 | Returns | |
200 | ------- | |
201 | NamedTuple | |
202 | Returned directly from the LRU Cache used internally. | |
203 | """ | |
204 | return self._fetch_block_cached.cache_info() | |
205 | ||
206 | def __getstate__(self): | |
207 | state = self.__dict__.copy()  # copy, so the live instance keeps its cache | |
208 | del state["_fetch_block_cached"] | |
209 | return state | |
210 | ||
211 | def __setstate__(self, state): | |
212 | self.__dict__.update(state) | |
213 | self._fetch_block_cached = functools.lru_cache(state["maxblocks"])( | |
214 | self._fetch_block | |
215 | ) | |
216 | ||
217 | def _fetch(self, start, end): | |
218 | if end < start: | |
219 | raise ValueError( | |
220 | "'end' ({}) is smaller than 'start' ({}).".format(end, start) | |
221 | ) | |
222 | ||
223 | if end > self.size: | |
224 | raise ValueError("'end={}' larger than size ('{}')".format(end, self.size)) | |
225 | ||
226 | # byte position -> block numbers | |
227 | start_block_number = start // self.blocksize | |
228 | end_block_number = end // self.blocksize | |
229 | ||
230 | # warm the LRU cache for every block touched by this read, so that | |
231 | # repeated calls for the same start and end are cheap | |
232 | for block_number in range(start_block_number, end_block_number + 1): | |
233 | self._fetch_block_cached(block_number) | |
233 | ||
234 | return self._read_cache( | |
235 | start, | |
236 | end, | |
237 | start_block_number=start_block_number, | |
238 | end_block_number=end_block_number, | |
239 | ) | |
240 | ||
241 | def _fetch_block(self, block_number): | |
242 | """ | |
243 | Fetch the block of data for `block_number`. | |
244 | """ | |
245 | if block_number > self.nblocks: | |
246 | raise ValueError( | |
247 | "'block_number={}' is greater than the number of blocks ({})".format( | |
248 | block_number, self.nblocks | |
249 | ) | |
250 | ) | |
251 | ||
252 | start = block_number * self.blocksize | |
253 | end = start + self.blocksize | |
254 | logger.info("BlockCache fetching block %d", block_number) | |
255 | block_contents = super()._fetch(start, end) | |
256 | return block_contents | |
257 | ||
258 | def _read_cache(self, start, end, start_block_number, end_block_number): | |
259 | """ | |
260 | Read from our block cache. | |
261 | ||
262 | Parameters | |
263 | ---------- | |
264 | start, end : int | |
265 | The start and end byte positions. | |
266 | start_block_number, end_block_number : int | |
267 | The start and end block numbers. | |
268 | """ | |
269 | start_pos = start % self.blocksize | |
270 | end_pos = end % self.blocksize | |
271 | ||
272 | if start_block_number == end_block_number: | |
273 | block = self._fetch_block_cached(start_block_number) | |
274 | return block[start_pos:end_pos] | |
275 | ||
276 | else: | |
277 | # read from the initial | |
278 | out = [] | |
279 | out.append(self._fetch_block_cached(start_block_number)[start_pos:]) | |
280 | ||
281 | # intermediate blocks | |
282 | # Note: it'd be nice to combine these into one big request. However | |
283 | # that doesn't play nicely with our LRU cache. | |
284 | for block_number in range(start_block_number + 1, end_block_number): | |
285 | out.append(self._fetch_block_cached(block_number)) | |
286 | ||
287 | # final block | |
288 | out.append(self._fetch_block_cached(end_block_number)[:end_pos]) | |
289 | ||
290 | return b"".join(out) | |
291 | ||
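292 | # A usage sketch: at most ``maxblocks`` blocks are held, with least | |
293 | # recently used eviction. | |
294 | # | |
295 | # >>> data = bytes(range(16)) | |
296 | # >>> bc = BlockCache(4, lambda s, e: data[s:e], len(data), maxblocks=2) | |
297 | # >>> bc[2:10] == data[2:10] # touches blocks 0, 1 and 2 | |
298 | # True | |
299 | # >>> bc.cache_info().currsize # only the two most recent blocks remain | |
300 | # 2 | |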
292 | ||
293 | class BytesCache(BaseCache): | |
294 | """Cache which holds data in a in-memory bytes object | |
295 | ||
296 | Implements read-ahead by the block size, for semi-random reads progressing | |
297 | through the file. | |
298 | ||
299 | Parameters | |
300 | ---------- | |
301 | trim: bool | |
302 | As we read more data, whether to discard the start of the buffer when | |
303 | we are more than a blocksize ahead of it. | |
304 | """ | |
305 | ||
306 | def __init__(self, blocksize, fetcher, size, trim=True): | |
307 | super().__init__(blocksize, fetcher, size) | |
308 | self.cache = b"" | |
309 | self.start = None | |
310 | self.end = None | |
311 | self.trim = trim | |
312 | ||
313 | def _fetch(self, start, end): | |
314 | # TODO: only set start/end after fetch, in case it fails? | |
315 | # is this where retry logic might go? | |
316 | if ( | |
317 | self.start is not None | |
318 | and start >= self.start | |
319 | and self.end is not None | |
320 | and end < self.end | |
321 | ): | |
322 | # cache hit: we have all the required data | |
323 | offset = start - self.start | |
324 | return self.cache[offset : offset + end - start] | |
325 | ||
326 | if self.blocksize: | |
327 | bend = min(self.size, end + self.blocksize) | |
328 | else: | |
329 | bend = end | |
330 | ||
331 | if bend == start or start > self.size: | |
332 | return b"" | |
333 | ||
334 | if (self.start is None or start < self.start) and ( | |
335 | self.end is None or end > self.end | |
336 | ): | |
337 | # First read, or extending both before and after | |
338 | self.cache = self.fetcher(start, bend) | |
339 | self.start = start | |
340 | elif start < self.start: | |
341 | if self.end - end > self.blocksize: | |
342 | self.cache = self.fetcher(start, bend) | |
343 | self.start = start | |
344 | else: | |
345 | new = self.fetcher(start, self.start) | |
346 | self.start = start | |
347 | self.cache = new + self.cache | |
348 | elif bend > self.end: | |
349 | if self.end > self.size: | |
350 | pass | |
351 | elif end - self.end > self.blocksize: | |
352 | self.cache = self.fetcher(start, bend) | |
353 | self.start = start | |
354 | else: | |
355 | new = self.fetcher(self.end, bend) | |
356 | self.cache = self.cache + new | |
357 | ||
358 | self.end = self.start + len(self.cache) | |
359 | offset = start - self.start | |
360 | out = self.cache[offset : offset + end - start] | |
361 | if self.trim: | |
362 | num = (self.end - self.start) // (self.blocksize + 1) | |
363 | if num > 1: | |
364 | self.start += self.blocksize * num | |
365 | self.cache = self.cache[self.blocksize * num :] | |
366 | return out | |
367 | ||
368 | def __len__(self): | |
369 | return len(self.cache) | |
370 | ||
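371 | # A usage sketch: with trim=False the buffer keeps growing as adjacent | |
372 | # ranges are requested. | |
373 | # | |
374 | # >>> data = b"0123456789abcdef" | |
375 | # >>> c = BytesCache(4, lambda s, e: data[s:e], len(data), trim=False) | |
376 | # >>> c[0:3] # first read: fetches bytes 0-7 | |
377 | # b'012' | |
378 | # >>> len(c) # the buffer holds the read-ahead too | |
379 | # 7 | |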
371 | ||
372 | caches = { | |
373 | "none": BaseCache, | |
374 | "mmap": MMapCache, | |
375 | "bytes": BytesCache, | |
376 | "readahead": ReadAheadCache, | |
377 | "block": BlockCache, | |
378 | } |
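379 | ||
380 | # This mapping is what file implementations consult when a caching strategy | |
381 | # is requested by name. A hedged sketch, assuming the target implementation | |
382 | # forwards a ``cache_type`` argument to AbstractBufferedFile: | |
383 | # | |
384 | # >>> import fsspec | |
385 | # >>> fs = fsspec.filesystem("http") # doctest: +SKIP | |
386 | # >>> f = fs.open("http://example.com/big.bin", cache_type="block") # doctest: +SKIP | |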
0 | """Helper functions for a standard streaming compression API""" | |
1 | from bz2 import BZ2File | |
2 | from gzip import GzipFile | |
3 | from zipfile import ZipFile | |
4 | ||
5 | import fsspec.utils | |
6 | from fsspec.spec import AbstractBufferedFile | |
7 | ||
8 | ||
9 | def noop_file(file, mode, **kwargs): | |
10 | return file | |
11 | ||
12 | ||
13 | # should be functions of the form func(infile, mode=, **kwargs) -> file-like | |
14 | compr = {None: noop_file} | |
15 | ||
16 | ||
17 | def register_compression(name, callback, extensions, force=False): | |
18 | """Register an "inferable" file compression type. | |
19 | ||
20 | Registers transparent file compression type for use with fsspec.open. | |
21 | Compression can be specified by name in open, or inferred (with | |
22 | compression="infer") for any files ending with the given extensions. | |
23 | ||
24 | Args: | |
25 | name: (str) The compression type name. E.g. "gzip". | |
26 | callback: A callable of form (infile, mode, **kwargs) -> file-like. | |
27 | Accepts an input file-like object, the target mode and kwargs. | |
28 | Returns a wrapped file-like object. | |
29 | extensions: (str, Iterable[str]) A file extension, or list of file | |
30 | extensions for which to infer this compression scheme. E.g. "gz". | |
31 | force: (bool) Force re-registration of compression type or extensions. | |
32 | ||
33 | Raises: | |
34 | ValueError: If name or extensions already registered, and not force. | |
35 | ||
36 | """ | |
37 | if isinstance(extensions, str): | |
38 | extensions = [extensions] | |
39 | ||
40 | # Validate registration | |
41 | if name in compr and not force: | |
42 | raise ValueError("Duplicate compression registration: %s" % name) | |
43 | ||
44 | for ext in extensions: | |
45 | if ext in fsspec.utils.compressions and not force: | |
46 | raise ValueError( | |
47 | "Duplicate compression file extension: %s (%s)" % (ext, name) | |
48 | ) | |
49 | ||
50 | compr[name] = callback | |
51 | ||
52 | for ext in extensions: | |
53 | fsspec.utils.compressions[ext] = name | |
54 | ||
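55 | # A sketch of registering a hypothetical pass-through codec under the | |
56 | # made-up extension "raw": | |
57 | # | |
58 | # >>> register_compression("identity", lambda f, mode="rb", **kw: f, "raw") # doctest: +SKIP | |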
55 | ||
56 | def unzip(infile, mode="rb", filename=None, **kwargs): | |
57 | if "r" not in mode: | |
58 | filename = filename or "file" | |
59 | z = ZipFile(infile, mode="w", **kwargs) | |
60 | fo = z.open(filename, mode="w") | |
61 | fo.close = lambda closer=fo.close: closer() or z.close() | |
62 | return fo | |
63 | z = ZipFile(infile) | |
64 | if filename is None: | |
65 | filename = z.namelist()[0] | |
66 | return z.open(filename, mode="r", **kwargs) | |
67 | ||
68 | ||
69 | register_compression("zip", unzip, "zip") | |
70 | register_compression("bz2", BZ2File, "bz2") | |
71 | register_compression("gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz") | |
72 | ||
73 | try: | |
74 | import lzma | |
75 | ||
76 | register_compression("lzma", lzma.LZMAFile, "xz") | |
77 | register_compression("xz", lzma.LZMAFile, "xz", force=True) | |
78 | except ImportError: | |
79 | pass | |
80 | ||
81 | try: | |
82 | import lzmaffi | |
83 | ||
84 | register_compression("lzma", lzmaffi.LZMAFile, "xz", force=True) | |
85 | register_compression("xz", lzmaffi.LZMAFile, "xz", force=True) | |
86 | except ImportError: | |
87 | pass | |
88 | ||
89 | ||
90 | class SnappyFile(AbstractBufferedFile): | |
91 | def __init__(self, infile, mode, **kwargs): | |
92 | import snappy | |
93 | ||
94 | self.details = {"size": 999999999} # not true, but OK if we don't seek | |
95 | super().__init__(fs=None, path="snappy", mode=mode.strip("b") + "b", **kwargs) | |
96 | self.infile = infile | |
97 | if "r" in mode: | |
98 | self.codec = snappy.StreamDecompressor() | |
99 | else: | |
100 | self.codec = snappy.StreamCompressor() | |
101 | ||
102 | def _upload_chunk(self, final=False): | |
103 | self.buffer.seek(0) | |
104 | out = self.codec.add_chunk(self.buffer.read()) | |
105 | self.infile.write(out) | |
106 | return True | |
107 | ||
108 | def seek(self, loc, whence=0): | |
109 | raise NotImplementedError("SnappyFile is not seekable") | |
110 | ||
111 | def seekable(self): | |
112 | return False | |
113 | ||
114 | def _fetch_range(self, start, end): | |
115 | """Get the specified set of bytes from remote""" | |
116 | data = self.infile.read(end - start) | |
117 | return self.codec.decompress(data) | |
118 | ||
119 | ||
120 | try: | |
121 | import snappy | |
122 | ||
123 | snappy.compress  # fails if "snappy" is a different package lacking this API | |
124 | # Snappy may use the .sz file extension, but this is not part of the | |
125 | # standard implementation. | |
126 | register_compression("snappy", SnappyFile, []) | |
127 | ||
128 | except (ImportError, NameError): | |
129 | pass | |
130 | ||
131 | try: | |
132 | import lz4.frame | |
133 | ||
134 | register_compression("lz4", lz4.frame.open, "lz4") | |
135 | except ImportError: | |
136 | pass | |
137 | ||
138 | try: | |
139 | import zstandard as zstd | |
140 | ||
141 | def zstandard_file(infile, mode="rb"): | |
142 | if "r" in mode: | |
143 | cctx = zstd.ZstdDecompressor() | |
144 | return cctx.stream_reader(infile) | |
145 | else: | |
146 | cctx = zstd.ZstdCompressor(level=10) | |
147 | return cctx.stream_writer(infile) | |
148 | ||
149 | register_compression("zstd", zstandard_file, "zst") | |
150 | except ImportError: | |
151 | pass |
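152 | ||
153 | # With the registrations above, compression is usually picked by file | |
154 | # extension; a sketch (the path is hypothetical): | |
155 | # | |
156 | # >>> import fsspec | |
157 | # >>> with fsspec.open("table.csv.gz", "rt", compression="infer") as f: # doctest: +SKIP | |
158 | # ... header = f.readline() | |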
0 | import os | |
1 | import shutil | |
2 | import subprocess | |
3 | import sys | |
4 | import time | |
5 | ||
6 | import pytest | |
7 | ||
8 | import fsspec | |
9 | from fsspec.implementations.cached import CachingFileSystem | |
10 | ||
11 | ||
12 | @pytest.fixture() | |
13 | def m(): | |
14 | """ | |
15 | Fixture providing a memory filesystem. | |
16 | """ | |
17 | m = fsspec.filesystem("memory") | |
18 | m.store.clear() | |
19 | try: | |
20 | yield m | |
21 | finally: | |
22 | m.store.clear() | |
23 | ||
24 | ||
25 | @pytest.fixture | |
26 | def ftp_writable(tmpdir): | |
27 | """ | |
28 | Fixture providing a writable FTP filesystem. | |
29 | """ | |
30 | pytest.importorskip("pyftpdlib") | |
31 | from fsspec.implementations.ftp import FTPFileSystem | |
32 | ||
33 | FTPFileSystem.clear_instance_cache() # remove lingering connections | |
34 | CachingFileSystem.clear_instance_cache() | |
35 | d = str(tmpdir) | |
36 | with open(os.path.join(d, "out"), "wb") as f: | |
37 | f.write(b"hello" * 10000) | |
38 | P = subprocess.Popen( | |
39 | [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"] | |
40 | ) | |
41 | try: | |
42 | time.sleep(1) | |
43 | yield "localhost", 2121, "user", "pass" | |
44 | finally: | |
45 | P.terminate() | |
46 | P.wait() | |
47 | try: | |
48 | shutil.rmtree(tmpdir) | |
49 | except Exception: | |
50 | pass |
0 | from __future__ import print_function, division, absolute_import | |
1 | ||
2 | import io | |
3 | import os | |
4 | import logging | |
5 | from .compression import compr | |
6 | from .utils import ( | |
7 | infer_compression, | |
8 | build_name_function, | |
9 | update_storage_options, | |
10 | stringify_path, | |
11 | ) | |
12 | from .registry import get_filesystem_class | |
13 | ||
14 | # for backwards compat, we export cache things from here too | |
15 | from .caching import ( # noqa: F401 | |
16 | BaseCache, | |
17 | MMapCache, | |
18 | ReadAheadCache, | |
19 | BytesCache, | |
20 | BlockCache, | |
21 | caches, | |
22 | ) | |
23 | ||
24 | logger = logging.getLogger("fsspec") | |
25 | ||
26 | ||
27 | class OpenFile(object): | |
28 | """ | |
29 | File-like object to be used in a context | |
30 | ||
31 | Can layer (buffered) text mode and compression over any file-system, | |
32 | which is typically binary-only. | |
33 | ||
34 | These instances are safe to serialize, as the low-level file object | |
35 | is not created until invoked using `with`. | |
36 | ||
37 | Parameters | |
38 | ---------- | |
39 | fs: FileSystem | |
40 | The file system to use for opening the file. Should match the interface | |
41 | of ``dask.bytes.local.LocalFileSystem``. | |
42 | path: str | |
43 | Location to open | |
44 | mode: str like 'rb', optional | |
45 | Mode of the opened file | |
46 | compression: str or None, optional | |
47 | Compression to apply | |
48 | encoding: str or None, optional | |
49 | The encoding to use if opened in text mode. | |
50 | errors: str or None, optional | |
51 | How to handle encoding errors if opened in text mode. | |
52 | newline: None or str | |
53 | Passed to TextIOWrapper in text mode, how to handle line endings. | |
54 | """ | |
55 | ||
56 | def __init__( | |
57 | self, | |
58 | fs, | |
59 | path, | |
60 | mode="rb", | |
61 | compression=None, | |
62 | encoding=None, | |
63 | errors=None, | |
64 | newline=None, | |
65 | ): | |
66 | self.fs = fs | |
67 | self.path = path | |
68 | self.mode = mode | |
69 | self.compression = get_compression(path, compression) | |
70 | self.encoding = encoding | |
71 | self.errors = errors | |
72 | self.newline = newline | |
73 | self.fobjects = [] | |
74 | ||
75 | def __reduce__(self): | |
76 | return ( | |
77 | OpenFile, | |
78 | ( | |
79 | self.fs, | |
80 | self.path, | |
81 | self.mode, | |
82 | self.compression, | |
83 | self.encoding, | |
84 | self.errors, | |
85 | self.newline,  # include newline so pickling round-trips text handling | |
86 | ), | |
86 | ) | |
87 | ||
88 | def __repr__(self): | |
89 | return "<OpenFile '{}'>".format(self.path) | |
90 | ||
91 | def __fspath__(self): | |
92 | return self.path | |
93 | ||
94 | def __enter__(self): | |
95 | mode = self.mode.replace("t", "").replace("b", "") + "b" | |
96 | ||
97 | f = self.fs.open(self.path, mode=mode) | |
98 | ||
99 | self.fobjects = [f] | |
100 | ||
101 | if self.compression is not None: | |
102 | compress = compr[self.compression] | |
103 | f = compress(f, mode=mode[0]) | |
104 | self.fobjects.append(f) | |
105 | ||
106 | if "b" not in self.mode: | |
107 | # assume, for example, that 'r' is equivalent to 'rt' as in builtin | |
108 | f = io.TextIOWrapper( | |
109 | f, encoding=self.encoding, errors=self.errors, newline=self.newline | |
110 | ) | |
111 | self.fobjects.append(f) | |
112 | ||
113 | return self.fobjects[-1] | |
114 | ||
115 | def __exit__(self, *args): | |
116 | self.close() | |
117 | ||
118 | def __del__(self): | |
119 | self.close() | |
120 | ||
121 | def open(self): | |
122 | """Materialise this as a real open file without context | |
123 | ||
124 | The file should be explicitly closed to avoid enclosed open file | |
125 | instances persisting | |
126 | """ | |
127 | return self.__enter__() | |
128 | ||
129 | def close(self): | |
130 | """Close all encapsulated file objects""" | |
131 | for f in reversed(self.fobjects): | |
132 | if "r" not in self.mode and not f.closed: | |
133 | f.flush() | |
134 | f.close() | |
135 | self.fobjects = [] | |
136 | ||
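137 | # A serialisation sketch: an OpenFile can be pickled before any real file | |
138 | # handle exists, then opened elsewhere (the path is hypothetical). | |
139 | # | |
140 | # >>> import pickle | |
141 | # >>> of = open("remote.csv", mode="rt", compression="infer") # doctest: +SKIP | |
142 | # >>> of2 = pickle.loads(pickle.dumps(of)) # doctest: +SKIP | |
143 | # >>> with of2 as f: # doctest: +SKIP | |
144 | # ... first = f.readline() | |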
137 | ||
138 | def open_files( | |
139 | urlpath, | |
140 | mode="rb", | |
141 | compression=None, | |
142 | encoding="utf8", | |
143 | errors=None, | |
144 | name_function=None, | |
145 | num=1, | |
146 | protocol=None, | |
147 | newline=None, | |
148 | **kwargs | |
149 | ): | |
150 | """ Given a path or paths, return a list of ``OpenFile`` objects. | |
151 | ||
152 | For writing, a str path must contain the "*" character, which will be filled | |
153 | in by increasing numbers, e.g., "part*" -> "part0", "part1" if num=2. | |
154 | ||
155 | For either reading or writing, can instead provide explicit list of paths. | |
156 | ||
157 | Parameters | |
158 | ---------- | |
159 | urlpath: string or list | |
160 | Absolute or relative filepath(s). Prefix with a protocol like ``s3://`` | |
161 | to read from alternative filesystems. To read from multiple files you | |
162 | can pass a globstring or a list of paths, with the caveat that they | |
163 | must all have the same protocol. | |
164 | mode: 'rb', 'wt', etc. | |
165 | compression: string | |
166 | Compression to use. See ``dask.bytes.compression.files`` for options. | |
167 | encoding: str | |
168 | For text mode only | |
169 | errors: None or str | |
170 | Passed to TextIOWrapper in text mode | |
171 | name_function: function or None | |
172 | if opening a set of files for writing, those files do not yet exist, | |
173 | so we need to generate their names by formatting the urlpath for | |
174 | each sequence number | |
175 | num: int [1] | |
176 | if writing mode, number of files we expect to create (passed to | |
177 | name_function) | |
178 | protocol: str or None | |
179 | If given, overrides the protocol found in the URL. | |
180 | newline: bytes or None | |
181 | Used for line terminator in text mode. If None, uses system default; | |
182 | if blank, uses no translation. | |
183 | **kwargs: dict | |
184 | Extra options that make sense to a particular storage connection, e.g. | |
185 | host, port, username, password, etc. | |
186 | ||
187 | Examples | |
188 | -------- | |
189 | >>> files = open_files('2015-*-*.csv') # doctest: +SKIP | |
190 | >>> files = open_files( | |
191 | ... 's3://bucket/2015-*-*.csv.gz', compression='gzip' | |
192 | ... ) # doctest: +SKIP | |
193 | ||
194 | Returns | |
195 | ------- | |
196 | List of ``OpenFile`` objects. | |
197 | """ | |
198 | fs, fs_token, paths = get_fs_token_paths( | |
199 | urlpath, | |
200 | mode, | |
201 | num=num, | |
202 | name_function=name_function, | |
203 | storage_options=kwargs, | |
204 | protocol=protocol, | |
205 | ) | |
206 | return [ | |
207 | OpenFile( | |
208 | fs, | |
209 | path, | |
210 | mode=mode, | |
211 | compression=compression, | |
212 | encoding=encoding, | |
213 | errors=errors, | |
214 | newline=newline, | |
215 | ) | |
216 | for path in paths | |
217 | ] | |
218 | ||
219 | ||
220 | def open( | |
221 | urlpath, | |
222 | mode="rb", | |
223 | compression=None, | |
224 | encoding="utf8", | |
225 | errors=None, | |
226 | protocol=None, | |
227 | newline=None, | |
228 | **kwargs | |
229 | ): | |
230 | """ Given a path or paths, return one ``OpenFile`` object. | |
231 | ||
232 | Parameters | |
233 | ---------- | |
234 | urlpath: string or list | |
235 | Absolute or relative filepath. Prefix with a protocol like ``s3://`` | |
236 | to read from alternative filesystems. Should not include glob | |
237 | character(s). | |
238 | mode: 'rb', 'wt', etc. | |
239 | compression: string | |
240 | Compression to use. See ``dask.bytes.compression.files`` for options. | |
241 | encoding: str | |
242 | For text mode only | |
243 | errors: None or str | |
244 | Passed to TextIOWrapper in text mode | |
245 | protocol: str or None | |
246 | If given, overrides the protocol found in the URL. | |
247 | newline: bytes or None | |
248 | Used for line terminator in text mode. If None, uses system default; | |
249 | if blank, uses no translation. | |
250 | **kwargs: dict | |
251 | Extra options that make sense to a particular storage connection, e.g. | |
252 | host, port, username, password, etc. | |
253 | ||
254 | Examples | |
255 | -------- | |
256 | >>> openfile = open('2015-01-01.csv') # doctest: +SKIP | |
257 | >>> openfile = open( | |
258 | ... 's3://bucket/2015-01-01.csv.gz', | |
259 | ... compression='gzip' | |
260 | ... ) # doctest: +SKIP | |
261 | >>> with openfile as f: | |
262 | ... df = pd.read_csv(f) # doctest: +SKIP | |
263 | ||
264 | Returns | |
265 | ------- | |
266 | ``OpenFile`` object. | |
267 | """ | |
268 | # pass protocol by keyword: positionally it would land in name_function | |
269 | return open_files( | |
270 | [urlpath], | |
271 | mode, | |
272 | compression, | |
273 | encoding, | |
274 | errors, | |
275 | protocol=protocol, | |
276 | newline=newline, | |
277 | **kwargs | |
278 | )[0] | |
278 | ||
279 | ||
280 | def get_compression(urlpath, compression): | |
281 | if compression == "infer": | |
282 | compression = infer_compression(urlpath) | |
283 | if compression is not None and compression not in compr: | |
284 | raise ValueError("Compression type %s not supported" % compression) | |
285 | return compression | |
286 | ||
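287 | # For example: | |
288 | # | |
289 | # >>> get_compression("data.csv.gz", "infer") | |
290 | # 'gzip' | |
291 | # >>> get_compression("data.csv", None) is None | |
292 | # True | |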
287 | ||
288 | def split_protocol(urlpath): | |
289 | """Return protocol, path pair""" | |
290 | urlpath = stringify_path(urlpath) | |
291 | if "://" in urlpath: | |
292 | protocol, path = urlpath.split("://", 1) | |
293 | if len(protocol) > 1: | |
294 | # excludes Windows paths | |
295 | return protocol, path | |
296 | return None, urlpath | |
297 | ||
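298 | # For example (single-letter "protocols" are treated as Windows drive | |
299 | # letters): | |
300 | # | |
301 | # >>> split_protocol("s3://bucket/key.csv") | |
302 | # ('s3', 'bucket/key.csv') | |
303 | # >>> split_protocol("c://temp/file.txt") | |
304 | # (None, 'c://temp/file.txt') | |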
298 | ||
299 | def strip_protocol(urlpath): | |
300 | """Return only path part of full URL, according to appropriate backend""" | |
301 | protocol, _ = split_protocol(urlpath) | |
302 | cls = get_filesystem_class(protocol) | |
303 | return cls._strip_protocol(urlpath) | |
304 | ||
305 | ||
306 | def expand_paths_if_needed(paths, mode, num, fs, name_function): | |
307 | """Expand paths if they have a ``*`` in them. | |
308 | ||
309 | Parameters | |
310 | ---------- | |
311 | paths: list of paths | |
312 | mode: str | |
313 | Mode in which to open files. | |
314 | num: int | |
315 | If opening in writing mode, number of files we expect to create. | |
316 | fs: filesystem object | |
317 | name_function: callable | |
318 | If opening in writing mode, this callable is used to generate path | |
319 | names. Names are generated for each partition by | |
320 | ``urlpath.replace('*', name_function(partition_index))``. | |
321 | ||
322 | Returns | |
323 | ------- | |
324 | list of paths | |
320 | """ | |
321 | expanded_paths = [] | |
322 | paths = list(paths) | |
323 | if "w" in mode and sum([1 for p in paths if "*" in p]) > 1: | |
324 | raise ValueError("When writing data, only one filename mask can be specified.") | |
325 | elif "w" in mode: | |
326 | num = max(num, len(paths)) | |
327 | for curr_path in paths: | |
328 | if "*" in curr_path: | |
329 | if "w" in mode: | |
330 | # expand using name_function | |
331 | expanded_paths.extend(_expand_paths(curr_path, name_function, num)) | |
332 | else: | |
333 | # expand using glob | |
334 | expanded_paths.extend(fs.glob(curr_path)) | |
335 | else: | |
336 | expanded_paths.append(curr_path) | |
337 | # if we generated more paths than asked for, trim the list | |
338 | if "w" in mode and len(expanded_paths) > num: | |
339 | expanded_paths = expanded_paths[:num] | |
340 | return expanded_paths | |
341 | ||
342 | ||
343 | def get_fs_token_paths( | |
344 | urlpath, mode="rb", num=1, name_function=None, storage_options=None, protocol=None | |
345 | ): | |
346 | """Filesystem, deterministic token, and paths from a urlpath and options. | |
347 | ||
348 | Parameters | |
349 | ---------- | |
350 | urlpath: string or iterable | |
351 | Absolute or relative filepath, URL (may include protocols like | |
352 | ``s3://``), or globstring pointing to data. | |
353 | mode: str, optional | |
354 | Mode in which to open files. | |
355 | num: int, optional | |
356 | If opening in writing mode, number of files we expect to create. | |
357 | name_function: callable, optional | |
358 | If opening in writing mode, this callable is used to generate path | |
359 | names. Names are generated for each partition by | |
360 | ``urlpath.replace('*', name_function(partition_index))``. | |
361 | storage_options: dict, optional | |
362 | Additional keywords to pass to the filesystem class. | |
363 | protocol: str or None | |
364 | To override the protocol specifier in the URL | |
365 | """ | |
366 | if isinstance(urlpath, (list, tuple)): | |
367 | if not urlpath: | |
368 | raise ValueError("empty urlpath sequence") | |
369 | protocols, paths = zip(*map(split_protocol, urlpath)) | |
370 | protocol = protocol or protocols[0] | |
371 | if not all(p == protocol for p in protocols): | |
372 | raise ValueError( | |
373 | "When specifying a list of paths, all paths must " | |
374 | "share the same protocol" | |
375 | ) | |
376 | cls = get_filesystem_class(protocol) | |
377 | optionss = list(map(cls._get_kwargs_from_urls, urlpath)) | |
378 | paths = [cls._strip_protocol(u) for u in urlpath] | |
379 | options = optionss[0] | |
380 | if not all(o == options for o in optionss): | |
381 | raise ValueError( | |
382 | "When specifying a list of paths, all paths must " | |
383 | "share the same file-system options" | |
384 | ) | |
385 | update_storage_options(options, storage_options) | |
386 | fs = cls(**options) | |
387 | paths = expand_paths_if_needed(paths, mode, num, fs, name_function) | |
388 | ||
389 | elif isinstance(urlpath, str) or hasattr(urlpath, "name"): | |
390 | protocols, path = split_protocol(urlpath) | |
391 | protocol = protocol or protocols | |
392 | cls = get_filesystem_class(protocol) | |
393 | ||
394 | options = cls._get_kwargs_from_urls(urlpath) | |
395 | path = cls._strip_protocol(urlpath) | |
396 | update_storage_options(options, storage_options) | |
397 | fs = cls(**options) | |
398 | ||
399 | if "w" in mode: | |
400 | paths = _expand_paths(path, name_function, num) | |
401 | elif "*" in path: | |
402 | paths = sorted(fs.glob(path)) | |
403 | else: | |
404 | paths = [path] | |
405 | ||
406 | else: | |
407 | raise TypeError("url type not understood: %s" % urlpath) | |
408 | ||
409 | return fs, fs._fs_token, paths | |
410 | ||
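411 | # A hedged sketch (bucket and files are hypothetical): | |
412 | # | |
413 | # >>> fs, token, paths = get_fs_token_paths("s3://bucket/2015-*.csv") # doctest: +SKIP | |
414 | # >>> paths # all glob matches, sorted # doctest: +SKIP | |
415 | # ['bucket/2015-01.csv', 'bucket/2015-02.csv'] | |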
411 | ||
412 | def _expand_paths(path, name_function, num): | |
413 | if isinstance(path, str): | |
414 | if path.count("*") > 1: | |
415 | raise ValueError("Output path spec must contain exactly one '*'.") | |
416 | elif "*" not in path: | |
417 | path = os.path.join(path, "*.part") | |
418 | ||
419 | if name_function is None: | |
420 | name_function = build_name_function(num - 1) | |
421 | ||
422 | paths = [path.replace("*", name_function(i)) for i in range(num)] | |
423 | if paths != sorted(paths): | |
424 | logger.warning( | |
425 | "In order to preserve order between partitions" | |
426 | " paths created with ``name_function`` should " | |
427 | "sort to partition order" | |
428 | ) | |
429 | elif isinstance(path, (tuple, list)): | |
430 | assert len(path) == num | |
431 | paths = list(path) | |
432 | else: | |
433 | raise ValueError( | |
434 | "Path should be either\n" | |
435 | "1. A list of paths: ['foo.json', 'bar.json', ...]\n" | |
436 | "2. A directory: 'foo/\n" | |
437 | "3. A path with a '*' in it: 'foo.*.json'" | |
438 | ) | |
439 | return paths |
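440 | ||
441 | # A sketch of the default numbering (assuming the stock name_function from | |
442 | # fsspec.utils.build_name_function): | |
443 | # | |
444 | # >>> _expand_paths("out-*.json", None, 3) | |
445 | # ['out-0.json', 'out-1.json', 'out-2.json'] | |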
0 | from __future__ import print_function | |
1 | import os | |
2 | import stat | |
3 | from errno import ENOENT, EIO | |
4 | import threading | |
5 | import time | |
6 | ||
7 | from fuse import FUSE, FuseOSError, Operations | |
8 | ||
9 | ||
10 | class FUSEr(Operations): | |
11 | def __init__(self, fs, path): | |
12 | self.fs = fs | |
13 | self.cache = {} | |
14 | self.root = path.rstrip("/") + "/" | |
15 | self.counter = 0 | |
16 | ||
17 | def getattr(self, path, fh=None): | |
18 | path = "".join([self.root, path.lstrip("/")]).rstrip("/") | |
19 | try: | |
20 | info = self.fs.info(path) | |
21 | except FileNotFoundError: | |
22 | raise FuseOSError(ENOENT) | |
23 | data = {"st_uid": 1000, "st_gid": 1000} | |
24 | perm = 0o777 | |
25 | ||
26 | if info["type"] != "file": | |
27 | data["st_mode"] = stat.S_IFDIR | perm | |
28 | data["st_size"] = 0 | |
29 | data["st_blksize"] = 0 | |
30 | else: | |
31 | data["st_mode"] = stat.S_IFREG | perm | |
32 | data["st_size"] = info["size"] | |
33 | data["st_blksize"] = 5 * 2 ** 20 | |
34 | data["st_nlink"] = 1 | |
35 | data["st_atime"] = time.time() | |
36 | data["st_ctime"] = time.time() | |
37 | data["st_mtime"] = time.time() | |
38 | return data | |
39 | ||
40 | def readdir(self, path, fh): | |
41 | path = "".join([self.root, path.lstrip("/")]) | |
42 | files = self.fs.ls(path, False) | |
43 | files = [os.path.basename(f.rstrip("/")) for f in files] | |
44 | return [".", ".."] + files | |
45 | ||
46 | def mkdir(self, path, mode): | |
47 | path = "".join([self.root, path.lstrip("/")]) | |
48 | self.fs.mkdir(path) | |
49 | return 0 | |
50 | ||
51 | def rmdir(self, path): | |
52 | path = "".join([self.root, path.lstrip("/")]) | |
53 | self.fs.rmdir(path) | |
54 | return 0 | |
55 | ||
56 | def read(self, path, size, offset, fh): | |
57 | f = self.cache[fh] | |
58 | f.seek(offset) | |
59 | out = f.read(size) | |
60 | return out | |
61 | ||
62 | def write(self, path, data, offset, fh): | |
63 | f = self.cache[fh] | |
64 | f.write(data) | |
65 | return len(data) | |
66 | ||
67 | def create(self, path, flags, fi=None): | |
68 | fn = "".join([self.root, path.lstrip("/")]) | |
69 | f = self.fs.open(fn, "wb") | |
70 | self.cache[self.counter] = f | |
71 | self.counter += 1 | |
72 | return self.counter - 1 | |
73 | ||
74 | def open(self, path, flags): | |
75 | fn = "".join([self.root, path.lstrip("/")]) | |
76 | if flags % 2 == 0: | |
77 | # read | |
78 | mode = "rb" | |
79 | else: | |
80 | # write/create | |
81 | mode = "wb" | |
82 | self.cache[self.counter] = self.fs.open(fn, mode) | |
83 | self.counter += 1 | |
84 | return self.counter - 1 | |
85 | ||
86 | def truncate(self, path, length, fh=None): | |
87 | fn = "".join([self.root, path.lstrip("/")]) | |
88 | if length != 0: | |
89 | raise NotImplementedError | |
90 | # maybe should be no-op since open with write sets size to zero anyway | |
91 | self.fs.touch(fn) | |
92 | ||
93 | def unlink(self, path): | |
94 | fn = "".join([self.root, path.lstrip("/")]) | |
95 | try: | |
96 | self.fs.rm(fn, False) | |
97 | except (IOError, FileNotFoundError): | |
98 | raise FuseOSError(EIO) | |
99 | ||
100 | def release(self, path, fh): | |
101 | try: | |
102 | if fh in self.cache: | |
103 | f = self.cache[fh] | |
104 | f.close() | |
105 | self.cache.pop(fh) | |
106 | except Exception as e: | |
107 | print(e) | |
108 | return 0 | |
109 | ||
110 | def chmod(self, path, mode): | |
111 | raise NotImplementedError | |
112 | ||
113 | ||
114 | def run(fs, path, mount_point, foreground=True, threads=False): | |
115 | """ Mount stuff in a local directory | |
116 | ||
117 | This uses fusepy to make it appear as if a given path on an fsspec | |
118 | instance is in fact resident within the local file-system. | |
119 | ||
120 | This requires that fusepy be installed, and that FUSE be available on | |
121 | the system (typically requiring a package to be installed with | |
122 | apt, yum, brew, etc.). | |
123 | ||
124 | Parameters | |
125 | ---------- | |
126 | fs: file-system instance | |
127 | From one of the compatible implementations | |
128 | path: str | |
129 | Location on that file-system to regard as the root directory to | |
130 | mount. Note that you typically should include the terminating "/" | |
131 | character. | |
132 | mount_point: str | |
133 | An empty directory on the local file-system where the contents of | |
134 | the remote path will appear | |
135 | foreground: bool | |
136 | Whether or not calling this function will block. Operation will | |
137 | typically be more stable if True. | |
138 | threads: bool | |
139 | Whether or not to create threads when responding to file operations | |
140 | within the mounted directory. Operation will typically be more | |
141 | stable if False. | |
142 | ||
143 | """ | |
144 | func = lambda: FUSE( | |
145 | FUSEr(fs, path), mount_point, nothreads=not threads, foreground=True | |
146 | ) | |
147 | if foreground is False: | |
148 | th = threading.Thread(target=func) | |
149 | th.daemon = True | |
150 | th.start() | |
151 | return th | |
152 | else: # pragma: no cover | |
153 | try: | |
154 | func() | |
155 | except KeyboardInterrupt: | |
156 | pass |
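157 | ||
158 | # A usage sketch (requires fusepy and FUSE; the mount point is | |
159 | # hypothetical and must be an empty local directory): | |
160 | # | |
161 | # >>> import fsspec | |
162 | # >>> fs = fsspec.filesystem("memory") | |
163 | # >>> th = run(fs, "/", "/mnt/mem", foreground=False) # doctest: +SKIP | |
164 | # >>> # the memory filesystem now appears under /mnt/mem | |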
0 | import time | |
1 | import pickle | |
2 | import logging | |
3 | import os | |
4 | import hashlib | |
5 | import tempfile | |
6 | import inspect | |
7 | from fsspec import AbstractFileSystem, filesystem | |
8 | from fsspec.spec import AbstractBufferedFile | |
9 | from fsspec.core import MMapCache, BaseCache | |
10 | ||
11 | logger = logging.getLogger("fsspec") | |
12 | ||
13 | ||
14 | class CachingFileSystem(AbstractFileSystem): | |
15 | """Locally caching filesystem, layer over any other FS | |
16 | ||
17 | This class implements chunk-wise local storage of remote files, for quick | |
18 | access after the initial download. The files are stored in a given | |
19 | directory with random hashes for the filenames. If no directory is given, | |
20 | a temporary one is used, which should be cleaned up by the OS after the | |
21 | process ends. The files themselves are sparse (as implemented in | |
22 | MMapCache), so only the data which is accessed takes up space. | |
23 | ||
24 | Restrictions: | |
25 | ||
26 | - the block-size must be the same for each access of a given file, unless | |
27 | all blocks of the file have already been read | |
28 | - caching can only be applied to file-systems which produce files | |
29 | derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also | |
30 | allowed, for testing | |
31 | """ | |
32 | ||
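33 | # A construction sketch (bucket and cache directory are hypothetical): | |
34 | # | |
35 | # >>> import fsspec | |
36 | # >>> fs = fsspec.filesystem("blockcache", target_protocol="s3", | |
37 | # ... target_options={"anon": True}, | |
38 | # ... cache_storage="/tmp/fsspec-cache") # doctest: +SKIP | |
39 | # >>> with fs.open("bucket/key.bin") as f: # doctest: +SKIP | |
40 | # ... data = f.read(1000) | |
41 | ||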
33 | protocol = ("blockcache", "cached") | |
34 | ||
35 | def __init__( | |
36 | self, | |
37 | target_protocol=None, | |
38 | cache_storage="TMP", | |
39 | cache_check=10, | |
40 | check_files=False, | |
41 | expiry_time=604800, | |
42 | target_options=None, | |
43 | **kwargs | |
44 | ): | |
45 | """ | |
46 | ||
47 | Parameters | |
48 | ---------- | |
49 | target_protocol: str | |
50 | Target filesystem protocol | |
51 | cache_storage: str or list(str) | |
52 | Location to store files. If "TMP", this is a temporary directory, | |
53 | and will be cleaned up by the OS when this process ends (or later). | |
54 | If a list, each location will be tried in the order given, but | |
55 | only the last will be considered writable. | |
56 | cache_check: int | |
57 | Number of seconds between reload of cache metadata | |
58 | check_files: bool | |
59 | Whether to explicitly see if the UID of the remote file matches | |
60 | the stored one before using. Warning: some file systems such as | |
61 | HTTP cannot reliably give a unique hash of the contents of some | |
62 | path, so be sure to set this option to False. | |
63 | expiry_time: int | |
64 | The time in seconds after which a local copy is considered useless. | |
65 | Set to falsy to prevent expiry. The default is equivalent to one | |
66 | week. | |
67 | target_options: dict or None | |
68 | Passed to the instantiation of the target FS, if target_protocol is a string. | |
69 | """ | |
70 | if self._cached: | |
71 | return | |
72 | super().__init__(**kwargs) | |
73 | if cache_storage == "TMP": | |
74 | storage = [tempfile.mkdtemp()] | |
75 | else: | |
76 | if isinstance(cache_storage, str): | |
77 | storage = [cache_storage] | |
78 | else: | |
79 | storage = cache_storage | |
80 | os.makedirs(storage[-1], exist_ok=True) | |
81 | self.storage = storage | |
82 | self.kwargs = target_options or {} | |
83 | self.cache_check = cache_check | |
84 | self.check_files = check_files | |
85 | self.expiry = expiry_time | |
86 | self.load_cache() | |
87 | if isinstance(target_protocol, AbstractFileSystem): | |
88 | self.fs = target_protocol | |
89 | self.protocol = self.fs.protocol | |
90 | else: | |
91 | self.protocol = target_protocol | |
92 | self.fs = filesystem(target_protocol, **self.kwargs) | |
93 | ||
94 | def __reduce_ex__(self, *_): | |
95 | return ( | |
96 | self.__class__, | |
97 | ( | |
98 | self.protocol, | |
99 | self.storage, | |
100 | self.cache_check, | |
101 | self.check_files, | |
102 | self.expiry, | |
103 | self.kwargs or None, | |
104 | ), | |
105 | ) | |
106 | ||
107 | def load_cache(self): | |
108 | """Read set of stored blocks from file""" | |
109 | cached_files = [] | |
110 | for storage in self.storage: | |
111 | fn = os.path.join(storage, "cache") | |
112 | if os.path.exists(fn): | |
113 | with open(fn, "rb") as f: | |
114 | # TODO: consolidate blocks here | |
115 | cached_files.append(pickle.load(f)) | |
116 | else: | |
117 | os.makedirs(storage, exist_ok=True) | |
118 | cached_files.append({}) | |
119 | self.cached_files = cached_files or [{}] | |
120 | self.last_cache = time.time() | |
121 | ||
122 | def save_cache(self): | |
123 | """Save set of stored blocks from file""" | |
124 | fn = os.path.join(self.storage[-1], "cache") | |
125 | # TODO: a file lock could be used to ensure file does not change | |
126 | # between re-read and write; but occasional duplicated reads ok. | |
127 | cache = self.cached_files[-1] | |
128 | if os.path.exists(fn): | |
129 | with open(fn, "rb") as f: | |
130 | cached_files = pickle.load(f) | |
131 | for k, c in cached_files.items(): | |
132 | if k in cache and c["blocks"] is not True: | |
133 | if cache[k]["blocks"] is True: | |
134 | c["blocks"] = True | |
135 | else: | |
136 | c["blocks"] = set(c["blocks"]).union(cache[k]["blocks"]) | |
137 | # keep entries created in this session that are not yet on disk | |
138 | for k, c in cache.items(): | |
139 | cached_files.setdefault(k, c) | |
140 | else: | |
141 | cached_files = cache | |
139 | cache = {k: v.copy() for k, v in cached_files.items()} | |
140 | for c in cache.values(): | |
141 | if isinstance(c["blocks"], set): | |
142 | c["blocks"] = list(c["blocks"]) | |
143 | with open(fn + ".temp", "wb") as f: | |
144 | pickle.dump(cache, f) | |
145 | if os.path.exists(fn): | |
146 | os.remove(fn) | |
147 | os.rename(fn + ".temp", fn) | |
148 | ||
149 | def _check_cache(self): | |
150 | """Reload caches if time elapsed or any disappeared""" | |
151 | if not self.cache_check: | |
152 | # explicitly told not to bother checking | |
153 | return | |
154 | timecond = time.time() - self.last_cache > self.cache_check | |
155 | existcond = all(os.path.exists(storage) for storage in self.storage) | |
156 | if timecond or not existcond: | |
157 | self.load_cache() | |
158 | ||
159 | def _check_file(self, path): | |
160 | """Is path in cache and still valid""" | |
161 | self._check_cache() | |
162 | for storage, cache in zip(self.storage, self.cached_files): | |
163 | if path not in cache: | |
164 | continue | |
165 | detail = cache[path].copy() | |
166 | if self.check_files: | |
167 | if detail["uid"] != self.fs.ukey(path): | |
168 | continue | |
169 | if self.expiry: | |
170 | if detail["time"] - time.time() > self.expiry: | |
171 | continue | |
172 | fn = os.path.join(storage, detail["fn"]) | |
173 | if os.path.exists(fn): | |
174 | return detail, fn | |
175 | return False, None | |
176 | ||
177 | def _open(self, path, mode="rb", **kwargs): | |
178 | """Wrap the target _open | |
179 | ||
180 | If the whole file exists in the cache, just open it locally and | |
181 | return that. | |
182 | ||
183 | Otherwise, open the file on the target FS, and make it have a mmap | |
184 | cache pointing to the location which we determine, in our cache. | |
185 | The ``blocks`` instance is shared, so as the mmap cache instance | |
186 | updates, so does the entry in our ``cached_files`` attribute. | |
187 | We monkey-patch this file, so that when it closes, we call | |
188 | ``close_and_update`` to save the state of the blocks. | |
189 | """ | |
190 | path = self._strip_protocol(path) | |
191 | if not path.startswith(self.protocol): | |
192 | path = self.protocol + "://" + path | |
193 | if mode != "rb": | |
194 | return self.fs._open(path, mode=mode, **kwargs) | |
195 | detail, fn = self._check_file(path) | |
196 | if detail: | |
197 | # file is in cache | |
198 | hash, blocks = detail["fn"], detail["blocks"] | |
199 | if blocks is True: | |
200 | # stored file is complete | |
201 | logger.debug("Opening local copy of %s" % path) | |
202 | return open(fn, "rb") | |
203 | # TODO: action where partial file exists in read-only cache | |
204 | logger.debug("Opening partially cached copy of %s" % path) | |
205 | else: | |
206 | hash = hashlib.sha256(path.encode()).hexdigest() | |
207 | fn = os.path.join(self.storage[-1], hash) | |
208 | blocks = set() | |
209 | detail = { | |
210 | "fn": hash, | |
211 | "blocks": blocks, | |
212 | "time": time.time(), | |
213 | "uid": self.fs.ukey(path), | |
214 | } | |
215 | self.cached_files[-1][path] = detail | |
216 | logger.debug("Creating local sparse file for %s" % path) | |
217 | kwargs["cache_type"] = "none" | |
218 | kwargs["mode"] = mode | |
219 | ||
220 | # call target filesystems open | |
221 | f = self.fs._open(path, **kwargs) | |
222 | if "blocksize" in detail: | |
223 | if detail["blocksize"] != f.blocksize: | |
224 | raise ValueError( | |
225 | "Cached file must be reopened with same block" | |
226 | "size as original (old: %i, new %i)" | |
227 | "" % (detail["blocksize"], f.blocksize) | |
228 | ) | |
229 | else: | |
230 | detail["blocksize"] = f.blocksize | |
231 | f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks) | |
232 | close = f.close | |
233 | f.close = lambda: self.close_and_update(f, close) | |
234 | return f | |
235 | ||
236 | def close_and_update(self, f, close): | |
237 | """Called when a file is closing, so store the set of blocks""" | |
238 | if f.path.startswith(self.protocol): | |
239 | path = f.path | |
240 | else: | |
241 | path = self.protocol + "://" + f.path | |
242 | c = self.cached_files[-1][path] | |
243 | if c["blocks"] is not True and len(["blocks"]) * f.blocksize >= f.size: | |
244 | c["blocks"] = True | |
245 | self.save_cache() | |
246 | close() | |
247 | ||
248 | def __getattribute__(self, item): | |
249 | if item in [ | |
250 | "load_cache", | |
251 | "_open", | |
252 | "save_cache", | |
253 | "close_and_update", | |
254 | "__init__", | |
255 | "__getattribute__", | |
256 | "__reduce_ex__", | |
257 | "open", | |
258 | "cat", | |
259 | "get", | |
260 | "read_block", | |
261 | "tail", | |
262 | "head", | |
263 | "_check_file", | |
264 | "_check_cache", | |
265 | ]: | |
266 | # all the methods defined in this class. Note `open` here, since | |
267 | # it calls `_open`, but is actually in superclass | |
268 | return lambda *args, **kw: getattr(type(self), item)(self, *args, **kw) | |
269 | if item == "__class__": | |
270 | return type(self) | |
271 | d = object.__getattribute__(self, "__dict__") | |
272 | fs = d.get("fs", None) # fs is not immediately defined | |
273 | if item in d: | |
274 | return d[item] | |
275 | elif fs is not None: | |
276 | if item in fs.__dict__: | |
277 | # attribute of instance | |
278 | return fs.__dict__[item] | |
279 | # attribute belonging to the target filesystem | |
280 | cls = type(fs) | |
281 | m = getattr(cls, item) | |
282 | if inspect.isfunction(m) and ( | |
283 | not hasattr(m, "__self__") or m.__self__ is None | |
284 | ): | |
285 | # instance method | |
286 | return m.__get__(fs, cls) | |
287 | return m # class method or attribute | |
288 | else: | |
289 | # attributes of the superclass, while target is being set up | |
290 | return super().__getattribute__(item) | |
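
# A minimal usage sketch of block-wise caching over HTTP; the URL and cache
# directory are illustrative assumptions, and the server must report a file
# size and honour Range requests:
#
#     import fsspec
#
#     fs = fsspec.filesystem(
#         "blockcache",
#         target_protocol="http",
#         cache_storage="/tmp/fsspec-blocks",
#     )
#     with fs.open("http://example.com/data.bin") as f:
#         header = f.read(1024)  # only the blocks actually read are stored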
291 | ||
292 | ||
293 | class WholeFileCacheFileSystem(CachingFileSystem): | |
294 | """Caches whole remote files on first access | |
295 | ||
296 | This class is intended as a layer over any other file system, and | |
297 | will make a local copy of each file accessed, so that all subsequent | |
298 | reads are local. This is similar to ``CachingFileSystem``, but without | |
299 | the block-wise functionality and so can work even when sparse files | |
300 | are not allowed. See its docstring for definition of the init | |
301 | arguments. | |
302 | ||
303 | The class still needs access to the remote store for listing files, | |
304 | and may refresh cached files. | |
305 | """ | |
306 | ||
307 | protocol = "filecache" | |
308 | ||
309 | def _open(self, path, mode="rb", **kwargs): | |
310 | path = self._strip_protocol(path) | |
311 | if not path.startswith(self.protocol): | |
312 | path = self.protocol + "://" + path | |
313 | if mode != "rb": | |
314 | return self.fs._open(path, mode=mode, **kwargs) | |
315 | detail, fn = self._check_file(path) | |
316 | if detail: | |
317 | hash, blocks = detail["fn"], detail["blocks"] | |
318 | if blocks is True: | |
319 | logger.debug("Opening local copy of %s" % path) | |
320 | return open(fn, "rb") | |
321 | else: | |
322 | raise ValueError( | |
323 | "Attempt to open partially cached file %s" | |
324 | "as a wholly cached file" % path | |
325 | ) | |
326 | else: | |
327 | hash = hashlib.sha256(path.encode()).hexdigest() | |
328 | fn = os.path.join(self.storage[-1], hash) | |
329 | blocks = True | |
330 | detail = { | |
331 | "fn": hash, | |
332 | "blocks": blocks, | |
333 | "time": time.time(), | |
334 | "uid": self.fs.ukey(path), | |
335 | } | |
336 | self.cached_files[-1][path] = detail | |
337 | logger.debug("Copying %s to local cache" % path) | |
338 | kwargs["mode"] = mode | |
339 | ||
340 | # call target filesystems open | |
341 | # TODO: why not just use fs.get ?? | |
342 | f = self.fs._open(path, **kwargs) | |
343 | with open(fn, "wb") as f2: | |
344 | if isinstance(f, AbstractBufferedFile): | |
345 | # want no type of caching if just downloading whole thing | |
346 | f.cache = BaseCache(0, f.cache.fetcher, f.size) | |
347 | if getattr(f, "blocksize", 0) and f.size: | |
348 | # opportunity to parallelise here | |
349 | data = True | |
350 | while data: | |
351 | data = f.read(f.blocksize) | |
352 | f2.write(data) | |
353 | else: | |
354 | # this only applies to HTTP, should instead use streaming | |
355 | f2.write(f.read()) | |
356 | self.save_cache() | |
357 | return self._open(path, mode) |
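
# A minimal usage sketch of whole-file caching; the URL and cache directory
# are illustrative assumptions. The first open downloads the complete file,
# after which reads are served from local disk:
#
#     import fsspec
#
#     fs = fsspec.filesystem(
#         "filecache",
#         target_protocol="http",
#         cache_storage="/tmp/fsspec-files",
#     )
#     with fs.open("http://example.com/data.csv") as f:
#         data = f.read()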
0 | from distributed.worker import get_worker | |
1 | from distributed.client import _get_global_client | |
2 | import dask | |
3 | from fsspec.spec import AbstractFileSystem, AbstractBufferedFile | |
4 | from fsspec import filesystem | |
5 | ||
6 | ||
7 | def make_instance(cls, args, kwargs): | |
8 | inst = cls(*args, **kwargs) | |
9 | inst._determine_worker() | |
10 | return inst | |
11 | ||
12 | ||
13 | class DaskWorkerFileSystem(AbstractFileSystem): | |
14 | """View files accessible to a worker as any other remote file-system | |
15 | ||
16 | When instances are run on the worker, uses the real filesystem. When | |
17 | run on the client, they call the worker to provide information or data. | |
18 | ||
19 | **Warning** this implementation is experimental, and read-only for now. | |
20 | """ | |
21 | ||
22 | def __init__(self, remote_protocol, remote_options=None, **kwargs): | |
23 | super().__init__(**kwargs) | |
24 | self.protocol = remote_protocol | |
25 | self.remote_options = remote_options | |
26 | self.worker = None | |
27 | self.client = None | |
28 | self.fs = None | |
29 | self._determine_worker() | |
30 | ||
31 | def _determine_worker(self): | |
32 | try: | |
33 | get_worker() | |
34 | self.worker = True | |
35 | self.fs = filesystem(self.protocol, **(self.remote_options or {})) | |
36 | except ValueError: | |
37 | self.worker = False | |
38 | self.client = _get_global_client() | |
39 | self.rfs = dask.delayed(self) | |
40 | ||
41 | def __reduce__(self): | |
42 | return make_instance, (type(self), self.storage_args, self.storage_options) | |
43 | ||
44 | def mkdir(self, *args, **kwargs): | |
45 | if self.worker: | |
46 | self.fs.mkdir(*args, **kwargs) | |
47 | else: | |
48 | self.rfs.mkdir(*args, **kwargs).compute() | |
49 | ||
50 | def rm(self, *args, **kwargs): | |
51 | if self.worker: | |
52 | self.fs.rm(*args, **kwargs) | |
53 | else: | |
54 | self.rfs.rm(*args, **kwargs).compute() | |
55 | ||
56 | def copy(self, *args, **kwargs): | |
57 | if self.worker: | |
58 | self.fs.copy(*args, **kwargs) | |
59 | else: | |
60 | self.rfs.copy(*args, **kwargs).compute() | |
61 | ||
62 | def mv(self, *args, **kwargs): | |
63 | if self.worker: | |
64 | self.fs.mv(*args, **kwargs) | |
65 | else: | |
66 | self.rfs.mv(*args, **kwargs).compute() | |
67 | ||
68 | def ls(self, *args, **kwargs): | |
69 | if self.worker: | |
70 | return self.fs.ls(*args, **kwargs) | |
71 | else: | |
72 | return self.rfs.ls(*args, **kwargs).compute() | |
73 | ||
74 | def _open(self, path, mode="rb", **kwargs): | |
75 | if self.worker: | |
76 | return self.fs._open(path, mode=mode) | |
77 | else: | |
78 | return DaskFile(self, path, mode, **kwargs) | |
79 | ||
80 | def fetch_range(self, path, mode, start, end): | |
81 | if self.worker: | |
82 | with self._open(path, mode) as f: | |
83 | f.seek(start) | |
84 | return f.read(end - start) | |
85 | else: | |
86 | return self.rfs.fetch_range(path, mode, start, end).compute() | |
87 | ||
88 | ||
89 | class DaskFile(AbstractBufferedFile): | |
90 | def __init__( | |
91 | self, | |
92 | fs, | |
93 | path, | |
94 | mode="rb", | |
95 | block_size="default", | |
96 | autocommit=True, | |
97 | cache_type="bytes", | |
98 | **kwargs | |
99 | ): | |
100 | super().__init__( | |
101 | fs, | |
102 | path, | |
103 | mode=mode, | |
104 | block_size=block_size, | |
105 | autocommit=autocommit, | |
106 | cache_type=cache_type, | |
107 | **kwargs | |
108 | ) | |
109 | ||
110 | def _upload_chunk(self, final=False): | |
111 | pass | |
112 | ||
113 | def _initiate_upload(self): | |
114 | """ Create remote file/upload """ | |
115 | pass | |
116 | ||
117 | def _fetch_range(self, start, end): | |
118 | """Get the specified set of bytes from remote""" | |
119 | return self.fs.fetch_range(self.path, self.mode, start, end) |
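
# A minimal usage sketch, assuming a dask.distributed client and at least one
# worker are already running; the path is illustrative:
#
#     from fsspec.implementations.dask import DaskWorkerFileSystem
#
#     fs = DaskWorkerFileSystem(remote_protocol="file")
#     fs.ls("/tmp")                    # the listing is computed on a worker
#     with fs.open("/tmp/data.bin") as f:
#         head = f.read(100)           # bytes travel back via fetch_range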
0 | from ftplib import FTP, Error, error_perm | |
1 | from socket import timeout | |
2 | import uuid | |
3 | from ..spec import AbstractBufferedFile, AbstractFileSystem | |
4 | from ..utils import infer_storage_options | |
5 | ||
6 | ||
7 | class FTPFileSystem(AbstractFileSystem): | |
8 | """A filesystem over classic """ | |
9 | ||
10 | root_marker = "/" | |
11 | cachable = False | |
12 | ||
13 | def __init__( | |
14 | self, | |
15 | host, | |
16 | port=21, | |
17 | username=None, | |
18 | password=None, | |
19 | acct=None, | |
20 | block_size=None, | |
21 | tempdir="/tmp", | |
22 | timeout=30, | |
23 | **kwargs | |
24 | ): | |
25 | """ | |
26 | You can use _get_kwargs_from_urls to get some kwargs from | |
27 | a reasonable FTP url. | |
28 | ||
29 | Authentication will be anonymous if username/password are not | |
30 | given. | |
31 | ||
32 | Parameters | |
33 | ---------- | |
34 | host: str | |
35 | The remote server name/ip to connect to | |
36 | port: int | |
37 | Port to connect with | |
38 | username: str or None | |
39 | If authenticating, the user's identifier | |
40 | password: str or None | |
41 | User's password on the server, if using | |
42 | acct: str or None | |
43 | Some servers also need an "account" string for auth | |
44 | block_size: int or None | |
45 | If given, the read-ahead or write buffer size. | |
46 | tempdir: str | |
47 | Directory on remote to put temporary files when in a transaction | |
48 | """ | |
49 | super(FTPFileSystem, self).__init__(**kwargs) | |
50 | self.host = host | |
51 | self.port = port | |
52 | self.tempdir = tempdir | |
53 | self.cred = username, password, acct | |
54 | self.timeout = timeout | |
55 | if block_size is not None: | |
56 | self.blocksize = block_size | |
57 | else: | |
58 | self.blocksize = 2 ** 16 | |
59 | self._connect() | |
60 | ||
61 | def _connect(self): | |
62 | self.ftp = FTP(timeout=self.timeout) | |
63 | self.ftp.connect(self.host, self.port) | |
64 | self.ftp.login(*self.cred) | |
65 | ||
66 | @classmethod | |
67 | def _strip_protocol(cls, path): | |
68 | return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/") | |
69 | ||
70 | @staticmethod | |
71 | def _get_kwargs_from_urls(urlpath): | |
72 | out = infer_storage_options(urlpath) | |
73 | out.pop("path", None) | |
74 | out.pop("protocol", None) | |
75 | return out | |
76 | ||
77 | def invalidate_cache(self, path=None): | |
78 | if path is not None: | |
79 | self.dircache.pop(path, None) | |
80 | else: | |
81 | self.dircache.clear() | |
82 | ||
83 | def ls(self, path, detail=True): | |
84 | path = self._strip_protocol(path) | |
85 | out = [] | |
86 | if path not in self.dircache: | |
87 | try: | |
88 | try: | |
89 | out = [ | |
90 | (fn, details) | |
91 | for (fn, details) in self.ftp.mlsd(path) | |
92 | if fn not in [".", ".."] | |
93 | and details["type"] not in ["pdir", "cdir"] | |
94 | ] | |
95 | except error_perm: | |
96 | out = _mlsd2(self.ftp, path) # Not platform independent | |
97 | for fn, details in out: | |
98 | if path == "/": | |
99 | path = "" # just for forming the names, below | |
100 | details["name"] = "/".join([path, fn.lstrip("/")]) | |
101 | if details["type"] == "file": | |
102 | details["size"] = int(details["size"]) | |
103 | else: | |
104 | details["size"] = 0 | |
105 | self.dircache[path] = out | |
106 | except Error: | |
107 | try: | |
108 | info = self.info(path) | |
109 | if info["type"] == "file": | |
110 | out = [(path, info)] | |
111 | except (Error, IndexError): | |
112 | raise FileNotFoundError(path) | |
113 | files = self.dircache.get(path, out) | |
114 | if not detail: | |
115 | return sorted([fn for fn, details in files]) | |
116 | return [details for fn, details in files] | |
117 | ||
118 | def info(self, path, **kwargs): | |
119 | # implement with direct method | |
120 | path = self._strip_protocol(path) | |
121 | files = self.ls(self._parent(path).lstrip("/"), True) | |
122 | try: | |
123 | out = [f for f in files if f["name"] == path][0] | |
124 | except IndexError: | |
125 | raise FileNotFoundError(path) | |
126 | return out | |
127 | ||
128 | def _open(self, path, mode="rb", block_size=None, autocommit=True, **kwargs): | |
129 | path = self._strip_protocol(path) | |
130 | block_size = block_size or self.blocksize | |
131 | return FTPFile( | |
132 | self, | |
133 | path, | |
134 | mode=mode, | |
135 | block_size=block_size, | |
136 | tempdir=self.tempdir, | |
137 | autocommit=autocommit, | |
138 | ) | |
139 | ||
140 | def _rm(self, path): | |
141 | path = self._strip_protocol(path) | |
142 | self.ftp.delete(path) | |
143 | self.invalidate_cache(path.rsplit("/", 1)[0]) | |
144 | ||
145 | def mkdir(self, path, **kwargs): | |
146 | path = self._strip_protocol(path) | |
147 | self.ftp.mkd(path) | |
148 | ||
149 | def rmdir(self, path): | |
150 | path = self._strip_protocol(path) | |
151 | self.ftp.rmd(path) | |
152 | ||
153 | def mv(self, path1, path2, **kwargs): | |
154 | path1 = self._strip_protocol(path1) | |
155 | path2 = self._strip_protocol(path2) | |
156 | self.ftp.rename(path1, path2) | |
157 | self.invalidate_cache(self._parent(path1)) | |
158 | self.invalidate_cache(self._parent(path2)) | |
159 | ||
160 | def __del__(self): | |
161 | self.ftp.close() | |
162 | ||
163 | ||
164 | class TransferDone(Exception): | |
165 | """Internal exception to break out of transfer""" | |
166 | ||
167 | pass | |
168 | ||
169 | ||
170 | class FTPFile(AbstractBufferedFile): | |
171 | """Interact with a remote FTP file with read/write buffering""" | |
172 | ||
173 | def __init__(self, fs, path, **kwargs): | |
174 | super().__init__(fs, path, **kwargs) | |
175 | if kwargs.get("autocommit", False) is False: | |
176 | self.target = self.path | |
177 | self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())]) | |
178 | ||
179 | def commit(self): | |
180 | self.fs.mv(self.path, self.target) | |
181 | ||
182 | def discard(self): | |
183 | self.fs.rm(self.path) | |
184 | ||
185 | def _fetch_range(self, start, end): | |
186 | """Get bytes between given byte limits | |
187 | ||
188 | Implemented by raising an exception in the fetch callback when the | |
189 | number of bytes received reaches the requested amount. | |
190 | ||
191 | Will fail if the server does not respect the REST command on | |
192 | retrieve requests. | |
193 | """ | |
194 | out = [] | |
195 | total = [0] | |
196 | ||
197 | def callback(x): | |
198 | total[0] += len(x) | |
199 | if total[0] > end - start: | |
200 | out.append(x[: (end - start) - total[0]]) | |
201 | raise TransferDone | |
202 | else: | |
203 | out.append(x) | |
204 | ||
205 | if total[0] == end - start: | |
206 | raise TransferDone | |
207 | ||
208 | try: | |
209 | self.fs.ftp.retrbinary( | |
210 | "RETR %s" % self.path, | |
211 | blocksize=self.blocksize, | |
212 | rest=start, | |
213 | callback=callback, | |
214 | ) | |
215 | except TransferDone: | |
216 | try: | |
217 | self.fs.ftp.abort() | |
218 | self.fs.ftp.voidresp() | |
219 | except timeout: | |
220 | self.fs._connect() | |
221 | return b"".join(out) | |
222 | ||
223 | def _upload_chunk(self, final=False): | |
224 | self.buffer.seek(0) | |
225 | self.fs.ftp.storbinary( | |
226 | "STOR " + self.path, self.buffer, blocksize=self.blocksize, rest=self.offset | |
227 | ) | |
228 | return True | |
229 | ||
230 | ||
231 | def _mlsd2(ftp, path="."): | |
232 | """ | |
233 | Fall back to using `dir` instead of `mlsd` if not supported. | |
234 | ||
235 | This parses a Linux style `ls -l` response to `dir`, but the response may | |
236 | be platform dependent. | |
237 | ||
238 | Parameters | |
239 | ---------- | |
240 | ftp: ftplib.FTP | |
241 | path: str | |
242 | Expects to be given a path, but defaults to ".". | |
243 | """ | |
244 | lines = [] | |
245 | minfo = [] | |
246 | ftp.dir(path, lines.append) | |
247 | for line in lines: | |
248 | line = line.split() | |
249 | this = ( | |
250 | line[-1], | |
251 | { | |
252 | "modify": " ".join(line[5:8]), | |
253 | "unix.owner": line[2], | |
254 | "unix.group": line[3], | |
255 | "unix.mode": line[0], | |
256 | "size": line[4], | |
257 | }, | |
258 | ) | |
259 | if "d" == this[1]["unix.mode"][0]: | |
260 | this[1]["type"] = "dir" | |
261 | else: | |
262 | this[1]["type"] = "file" | |
263 | minfo.append(this) | |
264 | return minfo |
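
# A minimal usage sketch against a hypothetical server; anonymous login is
# used when no username/password are given:
#
#     from fsspec.implementations.ftp import FTPFileSystem
#
#     fs = FTPFileSystem(host="ftp.example.com")
#     fs.ls("/pub")
#     with fs.open("/pub/readme.txt", "rb") as f:
#         head = f.read(100)  # issues RETR with a REST offset under the hood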
0 | import io | |
1 | import requests | |
2 | from ..spec import AbstractFileSystem | |
3 | ||
4 | ||
5 | class GithubFileSystem(AbstractFileSystem): | |
6 | """[Experimental] interface to files in github | |
7 | ||
8 | An instance of this class provides the files residing within a remote github | |
9 | repository. You may specify a point in the repo's history, by SHA, branch | |
10 | or tag (default is current master). | |
11 | ||
12 | Given that code files tend to be small, and that github does not support | |
13 | retrieving partial content, we always fetch whole files. | |
14 | """ | |
15 | ||
16 | url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" | |
17 | rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" | |
18 | protocol = "github" | |
19 | ||
20 | def __init__(self, org, repo, sha="master", **kwargs): | |
21 | super().__init__(**kwargs) | |
22 | self.org = org | |
23 | self.repo = repo | |
24 | self.root = sha | |
25 | self.ls("") | |
26 | ||
27 | def ls(self, path, detail=False, sha=None, **kwargs): | |
28 | if path == "": | |
29 | sha = self.root | |
30 | if sha is None: | |
31 | parts = path.rstrip("/").split("/") | |
32 | so_far = "" | |
33 | sha = self.root | |
34 | for part in parts: | |
35 | out = self.ls(so_far, True, sha=sha) | |
36 | so_far += "/" + part if so_far else part | |
37 | out = [o for o in out if o["name"] == so_far][0] | |
38 | if out["type"] == "file": | |
39 | if detail: | |
40 | return [out] | |
41 | else: | |
42 | return path | |
43 | sha = out["sha"] | |
44 | if path not in self.dircache: | |
45 | r = requests.get(self.url.format(org=self.org, repo=self.repo, sha=sha)) | |
46 | self.dircache[path] = [ | |
47 | { | |
48 | "name": path + "/" + f["path"] if path else f["path"], | |
49 | "mode": f["mode"], | |
50 | "type": {"blob": "file", "tree": "directory"}[f["type"]], | |
51 | "size": f.get("size", 0), | |
52 | "sha": f["sha"], | |
53 | } | |
54 | for f in r.json()["tree"] | |
55 | ] | |
56 | if detail: | |
57 | return self.dircache[path] | |
58 | else: | |
59 | return sorted([f["name"] for f in self.dircache[path]]) | |
60 | ||
61 | def _open(self, path, mode="rb", **kwargs): | |
62 | if mode != "rb": | |
63 | raise NotImplementedError | |
64 | url = self.rurl.format(org=self.org, repo=self.repo, path=path, sha=self.root) | |
65 | r = requests.get(url) | |
66 | return io.BytesIO(r.content) |
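
# A minimal usage sketch; the org/repo pair is illustrative and any public
# github repository should work:
#
#     from fsspec.implementations.github import GithubFileSystem
#
#     fs = GithubFileSystem(org="intake", repo="filesystem_spec")
#     fs.ls("")                  # top-level listing at the default sha
#     with fs.open("README.md") as f:
#         text = f.read()        # whole file, from raw.githubusercontent.com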
0 | from ..spec import AbstractFileSystem | |
1 | from ..utils import infer_storage_options | |
2 | from pyarrow.hdfs import HadoopFileSystem | |
3 | ||
4 | ||
5 | class PyArrowHDFS(AbstractFileSystem): | |
6 | """Adapted version of Arrow's HadoopFileSystem | |
7 | ||
8 | This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which | |
9 | passes on all calls to the underlying class. | |
10 | """ | |
11 | ||
12 | def __init__( | |
13 | self, | |
14 | host="default", | |
15 | port=0, | |
16 | user=None, | |
17 | kerb_ticket=None, | |
18 | driver="libhdfs", | |
19 | extra_conf=None, | |
20 | **kwargs | |
21 | ): | |
22 | """ | |
23 | ||
24 | Parameters | |
25 | ---------- | |
26 | host: str | |
27 | Hostname, IP or "default" to try to read from Hadoop config | |
28 | port: int | |
29 | Port to connect on, or default from Hadoop config if 0 | |
30 | user: str or None | |
31 | If given, connect as this username | |
32 | kerb_ticket: str or None | |
33 | If given, use this ticket for authentication | |
34 | driver: 'libhdfs' or 'libhdfs3' | |
35 | Binary driver; libhdfs is the JNI library and the default | |
36 | extra_conf: None or dict | |
37 | Passed on to HadoopFileSystem | |
38 | """ | |
39 | if self._cached: | |
40 | return | |
41 | AbstractFileSystem.__init__(self, **kwargs) | |
42 | self.pars = (host, port, user, kerb_ticket, driver, extra_conf) | |
43 | self.pahdfs = HadoopFileSystem( | |
44 | host=host, | |
45 | port=port, | |
46 | user=user, | |
47 | kerb_ticket=kerb_ticket, | |
48 | driver=driver, | |
49 | extra_conf=extra_conf, | |
50 | ) | |
51 | ||
52 | def _open(self, path, mode="rb", block_size=None, autocommit=True, **kwargs): | |
53 | """ | |
54 | ||
55 | Parameters | |
56 | ---------- | |
57 | path: str | |
58 | Location of file; should start with '/' | |
59 | mode: str | |
60 | block_size: int | |
61 | Hadoop block size, e.g., 2**26 | |
62 | autocommit: True | |
63 | Transactions are not yet implemented for HDFS; errors if not True | |
64 | kwargs: dict or None | |
65 | Hadoop config parameters | |
66 | ||
67 | Returns | |
68 | ------- | |
69 | HDFSFile file-like instance | |
70 | """ | |
71 | if not autocommit: | |
72 | raise NotImplementedError | |
73 | return HDFSFile(self, path, mode, block_size, **kwargs) | |
74 | ||
75 | def __reduce_ex__(self, protocol): | |
76 | return PyArrowHDFS, self.pars | |
77 | ||
78 | def ls(self, path, detail=True): | |
79 | out = self.pahdfs.ls(path, detail) | |
80 | if detail: | |
81 | for p in out: | |
82 | p["type"] = p["kind"] | |
83 | p["name"] = self._strip_protocol(p["name"]) | |
84 | else: | |
85 | out = [self._strip_protocol(p) for p in out] | |
86 | return out | |
87 | ||
88 | @staticmethod | |
89 | def _get_kwargs_from_urls(paths): | |
90 | ops = infer_storage_options(paths) | |
91 | out = {} | |
92 | if ops.get("host", None): | |
93 | out["host"] = ops["host"] | |
94 | if ops.get("username", None): | |
95 | out["user"] = ops["username"] | |
96 | if ops.get("port", None): | |
97 | out["port"] = ops["port"] | |
98 | return out | |
99 | ||
100 | @classmethod | |
101 | def _strip_protocol(cls, path): | |
102 | ops = infer_storage_options(path) | |
103 | return ops["path"] | |
104 | ||
105 | def __getattribute__(self, item): | |
106 | if item in [ | |
107 | "_open", | |
108 | "__init__", | |
109 | "__getattribute__", | |
110 | "__reduce_ex__", | |
111 | "open", | |
112 | "ls", | |
113 | "makedirs", | |
114 | ]: | |
115 | # all the methods defined in this class. Note `open` here, since | |
116 | # it calls `_open`, but is actually in superclass | |
117 | return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw) | |
118 | if item == "__class__": | |
119 | return PyArrowHDFS | |
120 | d = object.__getattribute__(self, "__dict__") | |
121 | pahdfs = d.get("pahdfs", None) # fs is not immediately defined | |
122 | if pahdfs is not None and item in [ | |
123 | "chmod", | |
124 | "chown", | |
125 | "user", | |
126 | "df", | |
127 | "disk_usage", | |
128 | "download", | |
129 | "driver", | |
130 | "exists", | |
131 | "extra_conf", | |
132 | "get_capacity", | |
133 | "get_space_used", | |
134 | "host", | |
135 | "is_open", | |
136 | "kerb_ticket", | |
137 | "strip_protocol", | |
138 | "mkdir", | |
139 | "mv", | |
140 | "port", | |
141 | "get_capacity", | |
142 | "get_space_used", | |
143 | "df", | |
144 | "chmod", | |
145 | "chown", | |
146 | "disk_usage", | |
147 | "download", | |
148 | "upload", | |
149 | "_get_kwargs_from_urls", | |
150 | "read_parquet", | |
151 | "rm", | |
152 | "stat", | |
153 | "upload", | |
154 | ]: | |
155 | return getattr(pahdfs, item) | |
156 | else: | |
157 | # attributes of the superclass, while target is being set up | |
158 | return super().__getattribute__(item) | |
159 | ||
160 | ||
161 | class HDFSFile(object): | |
162 | """Wrapper around arrow's HdfsFile | |
163 | ||
164 | Allows seek beyond EOF and (eventually) commit/discard | |
165 | """ | |
166 | ||
167 | def __init__(self, fs, path, mode, block_size, **kwargs): | |
168 | self.fs = fs | |
169 | self.path = path | |
170 | self.mode = mode | |
171 | self.block_size = block_size | |
172 | self.fh = fs.pahdfs.open(path, mode, block_size, **kwargs) | |
173 | if self.fh.readable(): | |
174 | self.seek_size = self.size() | |
175 | ||
176 | def seek(self, loc, whence=0): | |
177 | if whence == 0 and self.readable(): | |
178 | loc = min(loc, self.seek_size) | |
179 | return self.fh.seek(loc, whence) | |
180 | ||
181 | def __getattr__(self, item): | |
182 | return getattr(self.fh, item) | |
183 | ||
184 | def __reduce_ex__(self, protocol): | |
185 | return HDFSFile, (self.fs, self.path, self.mode, self.block_size) | |
186 | ||
187 | def __enter__(self): | |
188 | return self | |
189 | ||
190 | def __exit__(self, exc_type, exc_val, exc_tb): | |
191 | self.close() |
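
# A minimal usage sketch, assuming pyarrow with a working libhdfs and a
# reachable cluster; host, port and user are illustrative:
#
#     from fsspec.implementations.hdfs import PyArrowHDFS
#
#     fs = PyArrowHDFS(host="namenode", port=8020, user="hadoop")
#     fs.ls("/user/hadoop")
#     with fs.open("/user/hadoop/part-0.parquet", "rb") as f:
#         magic = f.read(4)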
0 | from __future__ import print_function, division, absolute_import | |
1 | ||
2 | import re | |
3 | import requests | |
4 | from urllib.parse import urlparse | |
5 | from fsspec import AbstractFileSystem | |
6 | from fsspec.spec import AbstractBufferedFile | |
7 | from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE | |
8 | ||
9 | # https://stackoverflow.com/a/15926317/3821154 | |
10 | ex = re.compile(r"""<a\s+(?:[^>]*?\s+)?href=(["'])(.*?)\1""") | |
11 | ex2 = re.compile(r"""(http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""") | |
12 | ||
13 | ||
14 | class HTTPFileSystem(AbstractFileSystem): | |
15 | """ | |
16 | Simple File-System for fetching data via HTTP(S) | |
17 | ||
18 | ``ls()`` is implemented by loading the parent page and doing a regex | |
19 | match on the result. If simple_links=True, anything that looks like | |
20 | a URL, e.g. "http(s)://server.com/stuff?thing=other", counts as a link; | |
21 | otherwise only links within HTML href tags will be used. | |
22 | """ | |
23 | ||
24 | sep = "/" | |
25 | ||
26 | def __init__( | |
27 | self, | |
28 | simple_links=True, | |
29 | block_size=None, | |
30 | same_scheme=True, | |
31 | size_policy=None, | |
32 | **storage_options | |
33 | ): | |
34 | """ | |
35 | Parameters | |
36 | ---------- | |
37 | block_size: int | |
38 | Block size, in bytes, used when reading; if 0, raw requests | |
39 | file-like objects are returned instead of HTTPFile instances | |
40 | simple_links: bool | |
41 | If True, will consider both HTML <a> tags and anything that looks | |
42 | like a URL; if False, will consider only the former. | |
43 | same_scheme: True | |
44 | When doing ls/glob, if this is True, only consider paths that have | |
45 | http/https matching the input URLs. | |
46 | size_policy: this argument is deprecated | |
47 | storage_options: key-value | |
48 | May be credentials, e.g., `{'auth': ('username', 'pword')}` or any | |
49 | other parameters passed on to requests | |
50 | """ | |
51 | AbstractFileSystem.__init__(self) | |
52 | self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE | |
53 | self.simple_links = simple_links | |
54 | self.same_scheme = same_scheme | |
55 | self.kwargs = storage_options | |
56 | self.session = requests.Session() | |
57 | ||
58 | @classmethod | |
59 | def _strip_protocol(cls, path): | |
60 | """ For HTTP, we always want to keep the full URL | |
61 | """ | |
62 | return path | |
63 | ||
64 | # TODO: override get | |
65 | ||
66 | def ls(self, url, detail=True): | |
67 | # ignoring URL-encoded arguments | |
68 | r = self.session.get(url, **self.kwargs) | |
69 | if self.simple_links: | |
70 | links = ex2.findall(r.text) + ex.findall(r.text) | |
71 | else: | |
72 | links = ex.findall(r.text) | |
73 | out = set() | |
74 | parts = urlparse(url) | |
75 | for l in links: | |
76 | if isinstance(l, tuple): | |
77 | l = l[1] | |
78 | if l.startswith("http"): | |
79 | if self.same_scheme: | |
80 | if l.split(":", 1)[0] == url.split(":", 1)[0]: | |
81 | out.add(l) | |
82 | elif l.replace("https", "http").startswith( | |
83 | url.replace("https", "http") | |
84 | ): | |
85 | # allowed to cross http <-> https | |
86 | out.add(l) | |
87 | elif l.startswith("/") and len(l) > 1: | |
88 | out.add(parts.scheme + "://" + parts.netloc + l) | |
89 | else: | |
90 | if l not in ["..", "../"]: | |
91 | # Ignore FTP-like "parent" | |
92 | out.add("/".join([url.rstrip("/"), l.lstrip("/")])) | |
93 | if not out and url.endswith("/"): | |
94 | return self.ls(url.rstrip("/"), detail=True) | |
95 | if detail: | |
96 | return [ | |
97 | { | |
98 | "name": u, | |
99 | "size": None, | |
100 | "type": "directory" if u.endswith("/") else "file", | |
101 | } | |
102 | for u in out | |
103 | ] | |
104 | else: | |
105 | return list(sorted(out)) | |
106 | ||
107 | def cat(self, url): | |
108 | r = self.session.get(url, **self.kwargs) | |
109 | r.raise_for_status() | |
110 | return r.content | |
111 | ||
112 | def mkdirs(self, url): | |
113 | """Make any intermediate directories to make path writable""" | |
114 | raise NotImplementedError | |
115 | ||
116 | def exists(self, path): | |
117 | kwargs = self.kwargs.copy() | |
118 | kwargs["stream"] = True | |
119 | try: | |
120 | r = self.session.get(path, **kwargs) | |
121 | r.close() | |
122 | return r.ok | |
123 | except requests.exceptions.RequestException: | |
124 | return False | |
125 | ||
126 | def _open(self, url, mode="rb", block_size=None, cache_options=None, **kwargs): | |
127 | """Make a file-like object | |
128 | ||
129 | Parameters | |
130 | ---------- | |
131 | url: str | |
132 | Full URL with protocol | |
133 | mode: string | |
134 | must be "rb" | |
135 | block_size: int or None | |
136 | Bytes to download in one request; use instance value if None. If | |
137 | zero, will return a streaming Requests file-like instance. | |
138 | kwargs: key-value | |
139 | Any other parameters, passed to requests calls | |
140 | """ | |
141 | if mode != "rb": | |
142 | raise NotImplementedError | |
143 | block_size = block_size if block_size is not None else self.block_size | |
144 | kw = self.kwargs.copy() | |
145 | kw.update(kwargs) | |
146 | kw.pop("autocommit", None) | |
147 | if block_size: | |
148 | return HTTPFile( | |
149 | self, url, self.session, block_size, cache_options=cache_options, **kw | |
150 | ) | |
151 | else: | |
152 | kw["stream"] = True | |
153 | r = self.session.get(url, **kw) | |
154 | r.raise_for_status() | |
155 | r.raw.decode_content = True | |
156 | return r.raw | |
157 | ||
158 | def ukey(self, url): | |
159 | """Unique identifier; assume HTTP files are static, unchanging""" | |
160 | return tokenize(url, self.kwargs, self.protocol) | |
161 | ||
162 | def info(self, url, **kwargs): | |
163 | """Get info of URL | |
164 | ||
165 | Tries to access location via HEAD, and then GET methods, but does | |
166 | not fetch the data. | |
167 | ||
168 | It is possible that the server does not supply any size information, in | |
169 | which case size will be given as None (and certain operations on the | |
170 | corresponding file will not work). | |
171 | """ | |
172 | size = False | |
173 | for policy in ["head", "get"]: | |
174 | try: | |
175 | size = file_size(url, self.session, policy, **self.kwargs) | |
176 | if size: | |
177 | break | |
178 | except Exception: | |
179 | pass | |
180 | else: | |
181 | # get failed, so conclude URL does not exist | |
182 | if size is False: | |
183 | raise FileNotFoundError(url) | |
184 | return {"name": url, "size": size or None, "type": "file"} | |
185 | ||
186 | ||
187 | class HTTPFile(AbstractBufferedFile): | |
188 | """ | |
189 | A file-like object pointing to a remote HTTP(S) resource | |
190 | ||
191 | Supports only reading, with read-ahead of a predetermined block-size. | |
192 | ||
193 | In the case that the server does not supply the filesize, only reading of | |
194 | the complete file in one go is supported. | |
195 | ||
196 | Parameters | |
197 | ---------- | |
198 | url: str | |
199 | Full URL of the remote resource, including the protocol | |
200 | session: requests.Session or None | |
201 | All calls will be made within this session, to avoid restarting | |
202 | connections where the server allows this | |
203 | block_size: int or None | |
204 | The amount of read-ahead to do, in bytes. Default is 5MB, or the value | |
205 | configured for the FileSystem creating this file | |
206 | size: None or int | |
207 | If given, this is the size of the file in bytes, and we don't attempt | |
208 | to call the server to find the value. | |
209 | kwargs: all other key-values are passed to requests calls. | |
210 | """ | |
211 | ||
212 | def __init__( | |
213 | self, | |
214 | fs, | |
215 | url, | |
216 | session=None, | |
217 | block_size=None, | |
218 | mode="rb", | |
219 | cache_type="bytes", | |
220 | cache_options=None, | |
221 | size=None, | |
222 | **kwargs | |
223 | ): | |
224 | if mode != "rb": | |
225 | raise NotImplementedError("File mode not supported") | |
226 | self.url = url | |
227 | self.session = session if session is not None else requests.Session() | |
228 | if size is not None: | |
229 | self.details = {"name": url, "size": size, "type": "file"} | |
230 | super().__init__( | |
231 | fs=fs, | |
232 | path=url, | |
233 | mode=mode, | |
234 | block_size=block_size, | |
235 | cache_type=cache_type, | |
236 | cache_options=cache_options, | |
237 | **kwargs | |
238 | ) | |
239 | self.cache.size = self.size or self.blocksize | |
240 | ||
241 | def read(self, length=-1): | |
242 | """Read bytes from file | |
243 | ||
244 | Parameters | |
245 | ---------- | |
246 | length: int | |
247 | Read up to this many bytes. If negative, read all content to end of | |
248 | file. If the server has not supplied the filesize, attempting to | |
249 | read only part of the data will raise a ValueError. | |
250 | """ | |
251 | if ( | |
252 | (length < 0 and self.loc == 0) # explicit read all | |
253 | or (length > (self.size or length)) # read more than there is | |
254 | or (self.size and self.size < self.blocksize) # all fits in one block anyway | |
255 | ): | |
258 | self._fetch_all() | |
259 | if self.size is None: | |
260 | if length < 0: | |
261 | self._fetch_all() | |
262 | else: | |
263 | length = min(self.size - self.loc, length) | |
264 | return super().read(length) | |
265 | ||
266 | def _fetch_all(self): | |
267 | """Read whole file in one shot, without caching | |
268 | ||
269 | This is only called when position is still at zero, | |
270 | and read() is called without a byte-count. | |
271 | """ | |
272 | if not isinstance(self.cache, AllBytes): | |
273 | r = self.session.get(self.url, **self.kwargs) | |
274 | r.raise_for_status() | |
275 | out = r.content | |
276 | self.cache = AllBytes(out) | |
277 | self.size = len(out) | |
278 | ||
279 | def _fetch_range(self, start, end): | |
280 | """Download a block of data | |
281 | ||
282 | The expectation is that the server returns only the requested bytes, | |
283 | with HTTP code 206. If this is not the case, we first check the headers, | |
284 | and then stream the output - if the data size is bigger than we | |
285 | requested, an exception is raised. | |
286 | """ | |
287 | kwargs = self.kwargs.copy() | |
288 | headers = kwargs.pop("headers", {}) | |
289 | headers["Range"] = "bytes=%i-%i" % (start, end - 1) | |
290 | r = self.session.get(self.url, headers=headers, stream=True, **kwargs) | |
291 | if r.status_code == 416: | |
292 | # range request outside file | |
293 | return b"" | |
294 | r.raise_for_status() | |
295 | if r.status_code == 206: | |
296 | # partial content, as expected | |
297 | out = r.content | |
298 | elif "Content-Length" in r.headers: | |
299 | cl = int(r.headers["Content-Length"]) | |
300 | if cl <= end - start: | |
301 | # data size OK | |
302 | out = r.content | |
303 | else: | |
304 | raise ValueError( | |
305 | "Got more bytes (%i) than requested (%i)" % (cl, end - start) | |
306 | ) | |
307 | else: | |
308 | cl = 0 | |
309 | out = [] | |
310 | for chunk in r.iter_content(chunk_size=2 ** 20): | |
311 | # data size unknown, let's see if it goes too big | |
312 | if chunk: | |
313 | out.append(chunk) | |
314 | cl += len(chunk) | |
315 | if cl > end - start: | |
316 | raise ValueError( | |
317 | "Got more bytes so far (>%i) than requested (%i)" | |
318 | % (cl, end - start) | |
319 | ) | |
320 | else: | |
321 | break | |
322 | out = b"".join(out) | |
323 | return out | |
324 | ||
325 | ||
326 | def file_size(url, session=None, size_policy="head", **kwargs): | |
327 | """Call HEAD on the server to get file size | |
328 | ||
329 | Default operation is to explicitly allow redirects and use encoding | |
330 | 'identity' (no compression) to get the true size of the target. | |
331 | """ | |
332 | kwargs = kwargs.copy() | |
333 | ar = kwargs.pop("allow_redirects", True) | |
334 | head = kwargs.get("headers", {}).copy() | |
335 | head["Accept-Encoding"] = "identity" | |
336 | session = session or requests.Session() | |
337 | if size_policy == "head": | |
338 | r = session.head(url, allow_redirects=ar, **kwargs) | |
339 | elif size_policy == "get": | |
340 | kwargs["stream"] = True | |
341 | r = session.get(url, allow_redirects=ar, **kwargs) | |
342 | else: | |
343 | raise TypeError('size_policy must be "head" or "get", got %s' % size_policy) | |
344 | if "Content-Length" in r.headers: | |
345 | return int(r.headers["Content-Length"]) | |
346 | elif "Content-Range" in r.headers: | |
347 | return int(r.headers["Content-Range"].split("/")[1]) | |
348 | ||
349 | ||
350 | class AllBytes(object): | |
351 | """Cache entire contents of a remote URL""" | |
352 | ||
353 | def __init__(self, data): | |
354 | self.data = data | |
355 | ||
356 | def _fetch(self, start, end): | |
357 | return self.data[start:end] |
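
# A minimal usage sketch of random access; the URL is illustrative and the
# server must honour Range requests for partial reads:
#
#     from fsspec.implementations.http import HTTPFileSystem
#
#     fs = HTTPFileSystem()
#     with fs.open("http://example.com/large.bin", block_size=2 ** 20) as f:
#         f.seek(10000000)
#         chunk = f.read(1024)  # fetched via "Range: bytes=..." in _fetch_range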
0 | import io | |
1 | import os | |
2 | import shutil | |
3 | import posixpath | |
4 | import re | |
5 | import tempfile | |
6 | from fsspec import AbstractFileSystem | |
7 | from fsspec.utils import stringify_path | |
8 | ||
9 | ||
10 | class LocalFileSystem(AbstractFileSystem): | |
11 | """Interface to files on local storage | |
12 | ||
13 | Parameters | |
14 | ---------- | |
15 | auto_mkdir: bool | |
16 | Whether, when opening a file, the directory containing it should | |
17 | be created (if it doesn't already exist). This is assumed by pyarrow | |
18 | code. | |
19 | """ | |
20 | ||
21 | root_marker = "/" | |
22 | ||
23 | def __init__(self, auto_mkdir=True, **kwargs): | |
24 | super().__init__(**kwargs) | |
25 | self.auto_mkdir = auto_mkdir | |
26 | ||
27 | def mkdir(self, path, create_parents=True, **kwargs): | |
28 | path = self._strip_protocol(path) | |
29 | if create_parents: | |
30 | self.makedirs(path, exist_ok=True) | |
31 | else: | |
32 | os.mkdir(path, **kwargs) | |
33 | ||
34 | def makedirs(self, path, exist_ok=False): | |
35 | path = self._strip_protocol(path) | |
36 | os.makedirs(path, exist_ok=exist_ok) | |
37 | ||
38 | def rmdir(self, path): | |
39 | os.rmdir(path) | |
40 | ||
41 | def ls(self, path, detail=False): | |
42 | path = self._strip_protocol(path) | |
43 | paths = [posixpath.join(path, f) for f in os.listdir(path)] | |
44 | if detail: | |
45 | return [self.info(f) for f in paths] | |
46 | else: | |
47 | return paths | |
48 | ||
49 | def glob(self, path, **kwargs): | |
50 | path = self._strip_protocol(path) | |
51 | return super().glob(path) | |
52 | ||
53 | def info(self, path, **kwargs): | |
54 | path = self._strip_protocol(path) | |
55 | out = os.stat(path, follow_symlinks=False) | |
56 | dest = False | |
57 | if os.path.islink(path): | |
58 | t = "link" | |
59 | dest = os.readlink(path) | |
60 | elif os.path.isdir(path): | |
61 | t = "directory" | |
62 | elif os.path.isfile(path): | |
63 | t = "file" | |
64 | else: | |
65 | t = "other" | |
66 | result = {"name": path, "size": out.st_size, "type": t, "created": out.st_ctime} | |
67 | for field in ["mode", "uid", "gid", "mtime"]: | |
68 | result[field] = getattr(out, "st_" + field) | |
69 | if dest: | |
70 | result["destination"] = dest | |
71 | try: | |
72 | out2 = os.stat(path, follow_symlinks=True) | |
73 | result["size"] = out2.st_size | |
74 | except IOError: | |
75 | result["size"] = 0 | |
76 | return result | |
77 | ||
78 | def copy(self, path1, path2, **kwargs): | |
79 | shutil.copyfile(path1, path2) | |
80 | ||
81 | def get(self, path1, path2, **kwargs): | |
82 | if kwargs.get("recursive"): | |
83 | return super(LocalFileSystem, self).get(path1, path2, **kwargs) | |
84 | else: | |
85 | return self.copy(path1, path2, **kwargs) | |
86 | ||
87 | def put(self, path1, path2, **kwargs): | |
88 | if kwargs.get("recursive"): | |
89 | return super(LocalFileSystem, self).put(path1, path2, **kwargs) | |
90 | else: | |
91 | return self.copy(path1, path2, **kwargs) | |
92 | ||
93 | def mv(self, path1, path2, **kwargs): | |
94 | os.rename(path1, path2) | |
95 | ||
96 | def rm(self, path, recursive=False, maxdepth=None): | |
97 | if recursive and self.isdir(path): | |
98 | shutil.rmtree(path) | |
99 | else: | |
100 | os.remove(path) | |
101 | ||
102 | def _open(self, path, mode="rb", block_size=None, **kwargs): | |
103 | path = self._strip_protocol(path) | |
104 | if self.auto_mkdir: | |
105 | self.makedirs(self._parent(path), exist_ok=True) | |
106 | return LocalFileOpener(path, mode, fs=self, **kwargs) | |
107 | ||
108 | def touch(self, path, **kwargs): | |
109 | path = self._strip_protocol(path) | |
110 | if self.exists(path): | |
111 | os.utime(path, None) | |
112 | else: | |
113 | open(path, "a").close() | |
114 | ||
115 | @classmethod | |
116 | def _parent(cls, path): | |
117 | path = cls._strip_protocol(path).rstrip("/") | |
118 | if "/" in path: | |
119 | return path.rsplit("/", 1)[0] | |
120 | else: | |
121 | return cls.root_marker | |
122 | ||
123 | @classmethod | |
124 | def _strip_protocol(cls, path): | |
125 | path = stringify_path(path) | |
126 | if path.startswith("file://"): | |
127 | path = path[7:] | |
128 | return make_path_posix(path) | |
129 | ||
130 | ||
131 | def make_path_posix(path, sep=os.sep): | |
132 | """ Make path generic """ | |
133 | if re.match("/[A-Za-z]:", path): | |
134 | # for windows file URI like "file:///C:/folder/file" | |
135 | # or "file:///C:\\dir\\file" | |
136 | path = path[1:] | |
137 | if path.startswith("\\\\"): | |
138 | # special case for windows UNC/DFS-style paths: keep them, | |
139 | # just flip the slashes around (case below does not work!) | |
140 | return path.replace("\\", "/") | |
141 | if path.startswith("\\") or re.match("[\\\\]*[A-Za-z]:", path): | |
142 | # windows full path "\\server\\path" or "C:\\local\\path" | |
143 | return path.lstrip("\\").replace("\\", "/").replace("//", "/") | |
144 | if ( | |
145 | sep not in path | |
146 | and "/" not in path | |
147 | or (sep == "/" and not path.startswith("/")) | |
148 | or (sep == "\\" and ":" not in path) | |
149 | ): | |
150 | # relative path like "path" or "rel\\path" (win) or "rel/path" | |
151 | path = os.path.abspath(path) | |
152 | if os.sep == "\\": | |
153 | # abspath made some more '\\' separators | |
154 | return make_path_posix(path, sep) | |
155 | return path | |
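
# Illustrative inputs and outputs for make_path_posix, following the branches
# above (shown as Python literals):
#
#     make_path_posix("/C:/folder/file")       -> "C:/folder/file"
#     make_path_posix("C:\\local\\path")       -> "C:/local/path"
#     make_path_posix("\\\\server\\share\\x")  -> "//server/share/x"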
156 | ||
157 | ||
158 | class LocalFileOpener(object): | |
159 | def __init__(self, path, mode, autocommit=True, fs=None, **kwargs): | |
160 | self.path = path | |
161 | self.mode = mode | |
162 | self.fs = fs | |
163 | self.f = None | |
164 | self.autocommit = autocommit | |
165 | self.blocksize = io.DEFAULT_BUFFER_SIZE | |
166 | self._open() | |
167 | ||
168 | def _open(self): | |
169 | if self.f is None or self.f.closed: | |
170 | if self.autocommit or "w" not in self.mode: | |
171 | self.f = open(self.path, mode=self.mode) | |
172 | else: | |
173 | # TODO: check if path is writable? | |
174 | i, name = tempfile.mkstemp() | |
175 | self.temp = name | |
176 | self.f = open(name, mode=self.mode) | |
177 | if "w" not in self.mode: | |
178 | self.details = self.fs.info(self.path) | |
179 | self.size = self.details["size"] | |
180 | self.f.size = self.size | |
181 | ||
182 | def _fetch_range(self, start, end): | |
183 | # probably only used by cached FS | |
184 | if "r" not in self.mode: | |
185 | raise ValueError | |
186 | self._open() | |
187 | self.f.seek(start) | |
188 | return self.f.read(end - start) | |
189 | ||
190 | def __setstate__(self, state): | |
191 | self.f = None | |
192 | loc = state.pop("loc", None) | |
193 | self.__dict__.update(state) | |
194 | if "r" in self.mode: | |
195 | # reopen the file and restore the position saved by __getstate__ | |
196 | self._open() | |
197 | self.f.seek(loc) | |
198 | ||
199 | def __getstate__(self): | |
200 | d = self.__dict__.copy() | |
201 | d.pop("f") | |
202 | if "r" in self.mode: | |
203 | d["loc"] = self.f.tell() | |
204 | else: | |
205 | if not self.f.closed: | |
206 | raise ValueError("Cannot serialise open write-mode local file") | |
207 | return d | |
208 | ||
209 | def commit(self): | |
210 | if self.autocommit: | |
211 | raise RuntimeError("Can only commit if not already set to autocommit") | |
212 | os.rename(self.temp, self.path) | |
213 | ||
214 | def discard(self): | |
215 | if self.autocommit: | |
216 | raise RuntimeError("Cannot discard if set to autocommit") | |
217 | os.remove(self.temp) | |
218 | ||
219 | def __fspath__(self): | |
220 | # uniquely for fsspec implementations, this is a real path | |
221 | return self.path | |
222 | ||
223 | def __getattr__(self, item): | |
224 | return getattr(self.f, item) | |
225 | ||
226 | def __enter__(self): | |
227 | self._incontext = True | |
228 | return self.f.__enter__() | |
229 | ||
230 | def __exit__(self, exc_type, exc_value, traceback): | |
231 | self._incontext = False | |
232 | self.f.__exit__(exc_type, exc_value, traceback) |
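
# A minimal sketch of pickling a read-mode file; the path is illustrative.
# __getstate__ records the position and __setstate__ reopens and restores it:
#
#     import pickle
#
#     f = LocalFileSystem().open("/tmp/example.txt", "rb")
#     f.read(3)
#     g = pickle.loads(pickle.dumps(f))
#     assert g.tell() == 3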
0 | from __future__ import print_function, division, absolute_import | |
1 | ||
2 | from io import BytesIO | |
3 | from fsspec import AbstractFileSystem | |
4 | import logging | |
5 | ||
6 | logger = logging.getLogger("fsspec.memoryfs") | |
7 | ||
8 | ||
9 | class MemoryFileSystem(AbstractFileSystem): | |
10 | """A filesystem based on a dict of BytesIO objects""" | |
11 | ||
12 | store = {} # global | |
13 | pseudo_dirs = [] | |
14 | protocol = "memory" | |
15 | root_marker = "" | |
16 | ||
17 | def ls(self, path, detail=False): | |
18 | if path in self.store: | |
19 | # there is a key with this exact name, but could also be directory | |
20 | out = [ | |
21 | { | |
22 | "name": path, | |
23 | "size": self.store[path].getbuffer().nbytes, | |
24 | "type": "file", | |
25 | } | |
26 | ] | |
27 | else: | |
28 | out = [] | |
29 | path = path.strip("/").lstrip("/") | |
30 | paths = set() | |
31 | for p2 in self.store: | |
32 | has_slash = "/" if p2.startswith("/") else "" | |
33 | p = p2.lstrip("/") | |
34 | if "/" in p: | |
35 | root = p.rsplit("/", 1)[0] | |
36 | else: | |
37 | root = "" | |
38 | if root == path: | |
39 | out.append( | |
40 | { | |
41 | "name": has_slash + p, | |
42 | "size": self.store[p2].getbuffer().nbytes, | |
43 | "type": "file", | |
44 | } | |
45 | ) | |
46 | elif path and all( | |
47 | (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) | |
48 | ): | |
49 | # implicit directory | |
50 | ppath = "/".join(p.split("/")[: len(path.split("/")) + 1]) | |
51 | if ppath not in paths: | |
52 | out.append( | |
53 | { | |
54 | "name": has_slash + ppath + "/", | |
55 | "size": 0, | |
56 | "type": "directory", | |
57 | } | |
58 | ) | |
59 | paths.add(ppath) | |
60 | elif all( | |
61 | (a == b) | |
62 | for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) | |
63 | ): | |
64 | # root directory entry | |
65 | ppath = p.rstrip("/").split("/", 1)[0] | |
66 | if ppath not in paths: | |
67 | out.append( | |
68 | { | |
69 | "name": has_slash + ppath + "/", | |
70 | "size": 0, | |
71 | "type": "directory", | |
72 | } | |
73 | ) | |
74 | paths.add(ppath) | |
75 | for p2 in self.pseudo_dirs: | |
76 | if self._parent(p2).strip("/").rstrip("/") == path: | |
77 | out.append({"name": p2 + "/", "size": 0, "type": "directory"}) | |
78 | if detail: | |
79 | return out | |
80 | return sorted([f["name"] for f in out]) | |
81 | ||
82 | def mkdir(self, path): | |
83 | path = path.rstrip("/") | |
84 | if path not in self.pseudo_dirs: | |
85 | self.pseudo_dirs.append(path) | |
86 | ||
87 | def rmdir(self, path): | |
88 | path = path.rstrip("/") | |
89 | if path in self.pseudo_dirs: | |
90 | if self.ls(path) == []: | |
91 | self.pseudo_dirs.remove(path) | |
92 | else: | |
93 | raise OSError("Directory %s not empty" % path) | |
94 | else: | |
95 | raise FileNotFoundError(path) | |
96 | ||
97 | def exists(self, path): | |
98 | return path in self.store | |
99 | ||
100 | def _open(self, path, mode="rb", **kwargs): | |
101 | """Make a file-like object | |
102 | ||
103 | Parameters | |
104 | ---------- | |
105 | path: str | |
106 | identifier | |
107 | mode: str | |
108 | normally "rb", "wb" or "ab" | |
109 | """ | |
110 | if mode in ["rb", "ab", "rb+"]: | |
111 | if path in self.store: | |
112 | f = self.store[path] | |
113 | if mode == "rb": | |
114 | f.seek(0) | |
115 | else: | |
116 | f.seek(0, 2) | |
117 | return f | |
118 | else: | |
119 | raise FileNotFoundError(path) | |
120 | if mode == "wb": | |
121 | m = MemoryFile(self, path) | |
122 | if not self._intrans: | |
123 | m.commit() | |
124 | return m | |
125 | ||
126 | def copy(self, path1, path2, **kwargs): | |
127 | self.store[path2] = MemoryFile(self, path2, self.store[path1].getbuffer()) | |
128 | ||
129 | def cat(self, path): | |
130 | return self.store[path].getvalue() | |
131 | ||
132 | def _rm(self, path): | |
133 | del self.store[path] | |
134 | ||
135 | def size(self, path): | |
136 | """Size in bytes of the file at path""" | |
137 | if path not in self.store: | |
138 | raise FileNotFoundError(path) | |
139 | return self.store[path].getbuffer().nbytes | |
140 | ||
141 | ||
142 | class MemoryFile(BytesIO): | |
143 | """A BytesIO which can't close and works as a context manager | |
144 | ||
145 | Can initialise with data | |
146 | ||
147 | No need to provide fs, path if auto-committing (default) | |
148 | """ | |
149 | ||
150 | def __init__(self, fs, path, data=None): | |
151 | self.fs = fs | |
152 | self.path = path | |
153 | if data: | |
154 | self.write(data) | |
155 | self.size = len(data) | |
156 | self.seek(0) | |
157 | ||
158 | def __enter__(self): | |
159 | return self | |
160 | ||
161 | def close(self): | |
162 | self.size = self.seek(0, 2) | |
163 | ||
164 | def discard(self): | |
165 | pass | |
166 | ||
167 | def commit(self): | |
168 | self.fs.store[self.path] = self |
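
# A minimal usage sketch; the store is a plain class-level dict, so files
# written through one MemoryFileSystem instance are visible to all others:
#
#     fs = MemoryFileSystem()
#     with fs.open("/a/b.txt", "wb") as f:
#         f.write(b"data")
#     fs.cat("/a/b.txt")   # b'data'
#     fs.ls("/a")          # ["/a/b.txt"]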
0 | import paramiko | |
1 | from stat import S_ISDIR, S_ISLNK | |
2 | import types | |
3 | import uuid | |
4 | from .. import AbstractFileSystem | |
5 | from ..utils import infer_storage_options | |
6 | ||
7 | ||
8 | class SFTPFileSystem(AbstractFileSystem): | |
9 | """Files over SFTP/SSH | |
10 | ||
11 | Remote filesystem access over SSH/SFTP, using paramiko. | |
12 | """ | |
13 | ||
14 | protocol = "sftp", "ssh" | |
15 | ||
16 | def __init__(self, host, **ssh_kwargs): | |
17 | """ | |
18 | ||
19 | Parameters | |
20 | ---------- | |
21 | host: str | |
22 | Hostname or IP as a string | |
23 | temppath: str | |
24 | Location on the server to put files, when within a transaction | |
25 | ssh_kwargs: dict | |
26 | Parameters passed on to connection. See details in | |
27 | http://docs.paramiko.org/en/2.4/api/client.html#paramiko.client.SSHClient.connect | |
28 | May include port, username, password... | |
29 | """ | |
30 | if self._cached: | |
31 | return | |
32 | self.temppath = ssh_kwargs.pop("temppath", "/tmp") | |
33 | super(SFTPFileSystem, self).__init__(**ssh_kwargs) | |
34 | self.host = host | |
35 | self.ssh_kwargs = ssh_kwargs | |
36 | self._connect() | |
37 | ||
38 | def _connect(self): | |
39 | self.client = paramiko.SSHClient() | |
40 | self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) | |
41 | self.client.connect(self.host, **self.ssh_kwargs) | |
42 | self.ftp = self.client.open_sftp() | |
43 | ||
44 | @classmethod | |
45 | def _strip_protocol(cls, path): | |
46 | return infer_storage_options(path)["path"] | |
47 | ||
48 | @staticmethod | |
49 | def _get_kwargs_from_urls(urlpath): | |
50 | out = infer_storage_options(urlpath) | |
51 | out.pop("path", None) | |
52 | out.pop("protocol", None) | |
53 | return out | |
54 | ||
55 | def mkdir(self, path, mode=0o777): | |
56 | self.ftp.mkdir(path, mode) | |
57 | ||
58 | def makedirs(self, path, exist_ok=False, mode=0o777): | |
59 | if self.exists(path) and not exist_ok: | |
60 | raise FileExistsError("File exists: {}".format(path)) | |
61 | ||
62 | parts = path.split("/") | |
63 | path = "" | |
64 | ||
65 | for part in parts: | |
66 | path += "/" + part | |
67 | if not self.exists(path): | |
68 | self.mkdir(path, mode) | |
69 | ||
70 | def rmdir(self, path): | |
71 | self.ftp.rmdir(path) | |
72 | ||
73 | def info(self, path): | |
74 | s = self.ftp.stat(path) | |
75 | if S_ISDIR(s.st_mode): | |
76 | t = "directory" | |
77 | elif S_ISLNK(s.st_mode): | |
78 | t = "link" | |
79 | else: | |
80 | t = "file" | |
81 | return { | |
82 | "name": path + "/" if t == "directory" else path, | |
83 | "size": s.st_size, | |
84 | "type": t, | |
85 | "uid": s.st_uid, | |
86 | "gui": s.st_gid, | |
87 | "time": s.st_atime, | |
88 | "mtime": s.st_mtime, | |
89 | } | |
90 | ||
91 | def ls(self, path, detail=False): | |
92 | out = ["/".join([path.rstrip("/"), p]) for p in self.ftp.listdir(path)] | |
93 | out = [self.info(o) for o in out] | |
94 | if detail: | |
95 | return out | |
96 | return sorted([p["name"] for p in out]) | |
97 | ||
98 | def put(self, lpath, rpath): | |
99 | self.ftp.put(lpath, rpath) | |
100 | ||
101 | def get(self, rpath, lpath): | |
102 | self.ftp.get(rpath, lpath) | |
103 | ||
104 | def _open(self, path, mode="rb", block_size=None, **kwargs): | |
105 | """ | |
106 | block_size: int or None | |
107 | If 0, no buffering, if 1, line buffering, if >1, buffer that many | |
108 | bytes, if None use default from paramiko. | |
109 | """ | |
110 | if kwargs.get("autocommit", True) is False: | |
111 | # writes to temporary file, move on commit | |
112 | path2 = "{}/{}".format(self.temppath, uuid.uuid4()) | |
113 | f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1) | |
114 | f.temppath = path2 | |
115 | f.targetpath = path | |
116 | f.fs = self | |
117 | f.commit = types.MethodType(commit_a_file, f) | |
118 | f.discard = types.MethodType(discard_a_file, f) | |
119 | else: | |
120 | f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1) | |
121 | return f | |
122 | ||
123 | def _rm(self, path): | |
124 | if self.isdir(path): | |
125 | self.ftp.rmdir(path) | |
126 | else: | |
127 | self.ftp.remove(path) | |
128 | ||
129 | def mv(self, old, new): | |
130 | self.ftp.posix_rename(old, new) | |
131 | ||
132 | ||
133 | def commit_a_file(self): | |
134 | self.fs.mv(self.temppath, self.targetpath) | |
135 | ||
136 | ||
137 | def discard_a_file(self): | |
138 | self.fs._rm(self.temppath) |
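139 | ||
140 | # A minimal usage sketch, assuming an SSH server at a hypothetical host; | |
141 | # extra keyword arguments are passed straight to paramiko's connect(): | |
142 | # | |
143 | #     fs = SFTPFileSystem("ssh.example.com", username="user", password="pw") | |
144 | #     fs.ls("/home/user") | |
145 | #     with fs.open("/home/user/afile", "rb") as f: | |
146 | #         data = f.read()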
0 | import os | |
1 | import shutil | |
2 | import pickle | |
3 | import pytest | |
4 | ||
5 | import fsspec | |
6 | from fsspec.implementations.cached import CachingFileSystem | |
7 | from .test_ftp import FTPFileSystem | |
8 | ||
9 | ||
10 | @pytest.fixture | |
11 | def local_filecache(): | |
12 | import tempfile | |
13 | ||
14 | original_location = tempfile.mkdtemp() | |
15 | cache_location = tempfile.mkdtemp() | |
16 | original_file = os.path.join(original_location, "afile") | |
17 | data = b"test data" | |
18 | with open(original_file, "wb") as f: | |
19 | f.write(data) | |
20 | ||
21 | # we can access the file and read it | |
22 | fs = fsspec.filesystem( | |
23 | "filecache", target_protocol="file", cache_storage=cache_location | |
24 | ) | |
25 | ||
26 | return (data, original_file, cache_location, fs) | |
27 | ||
28 | ||
29 | def test_idempotent(): | |
30 | fs = CachingFileSystem("file") | |
31 | fs2 = CachingFileSystem("file") | |
32 | assert fs2 is fs | |
33 | fs3 = pickle.loads(pickle.dumps(fs)) | |
34 | assert fs3.storage == fs.storage | |
35 | ||
36 | ||
37 | def test_workflow(ftp_writable): | |
38 | host, port, user, pw = ftp_writable | |
39 | fs = FTPFileSystem(host, port, user, pw) | |
40 | with fs.open("/out", "wb") as f: | |
41 | f.write(b"test") | |
42 | fs = fsspec.filesystem( | |
43 | "cached", | |
44 | target_protocol="ftp", | |
45 | target_options={"host": host, "port": port, "username": user, "password": pw}, | |
46 | ) | |
47 | assert os.listdir(fs.storage[-1]) == [] | |
48 | with fs.open("/out") as f: | |
49 | assert os.listdir(fs.storage[-1]) | |
50 | assert f.read() == b"test" | |
51 | assert fs.cached_files[-1]["ftp:///out"]["blocks"] | |
52 | assert fs.cat("/out") == b"test" | |
53 | assert fs.cached_files[-1]["ftp:///out"]["blocks"] is True | |
54 | ||
55 | with fs.open("/out", "wb") as f: | |
56 | f.write(b"changed") | |
57 | ||
58 | assert fs.cat("/out") == b"test" # old value | |
59 | ||
60 | ||
61 | def test_blocksize(ftp_writable): | |
62 | host, port, user, pw = ftp_writable | |
63 | fs = FTPFileSystem(host, port, user, pw) | |
64 | with fs.open("/out_block", "wb") as f: | |
65 | f.write(b"test" * 4000) | |
66 | ||
67 | fs = fsspec.filesystem( | |
68 | "blockcache", | |
69 | target_protocol="ftp", | |
70 | target_options={"host": host, "port": port, "username": user, "password": pw}, | |
71 | ) | |
72 | ||
73 | with fs.open("/out_block", block_size=20) as f: | |
74 | assert f.read(1) == b"t" | |
75 | with pytest.raises(ValueError): | |
76 | fs.open("/out_block", block_size=30) | |
77 | ||
78 | ||
79 | def test_local_filecache_creates_dir_if_needed(): | |
80 | import tempfile | |
81 | ||
82 | original_location = tempfile.mkdtemp() | |
83 | cache_location = "foofoobarbar" | |
84 | assert not os.path.exists(cache_location) | |
85 | ||
86 | try: | |
87 | original_file = os.path.join(original_location, "afile") | |
88 | data = b"test data" | |
89 | with open(original_file, "wb") as f: | |
90 | f.write(data) | |
91 | ||
92 | # we can access the file and read it | |
93 | fs = fsspec.filesystem( | |
94 | "filecache", target_protocol="file", cache_storage=cache_location | |
95 | ) | |
96 | ||
97 | with fs.open(original_file, "rb") as f: | |
98 | data_in_cache = f.read() | |
99 | ||
100 | assert os.path.exists(cache_location) | |
101 | ||
102 | finally: | |
103 | shutil.rmtree(cache_location) | |
104 | ||
105 | assert data_in_cache == data | |
106 | ||
107 | ||
108 | def test_local_filecache_basic(local_filecache): | |
109 | data, original_file, cache_location, fs = local_filecache | |
110 | ||
111 | # reading from the file contains the right data | |
112 | with fs.open(original_file, "rb") as f: | |
113 | assert f.read() == data | |
114 | assert "cache" in os.listdir(cache_location) | |
115 | ||
116 | # the file in the location contains the right data | |
117 | fn = list(fs.cached_files[-1].values())[0]["fn"] # this is a hash value | |
118 | assert fn in os.listdir(cache_location) | |
119 | with open(os.path.join(cache_location, fn), "rb") as f: | |
120 | assert f.read() == data | |
121 | ||
122 | # still there when original file is removed (check=False) | |
123 | os.remove(original_file) | |
124 | with fs.open(original_file, "rb") as f: | |
125 | assert f.read() == data | |
126 | ||
127 | ||
128 | def test_local_filecache_does_not_change_when_original_data_changed(local_filecache): | |
129 | old_data, original_file, cache_location, fs = local_filecache | |
130 | new_data = b"abc" | |
131 | ||
132 | with fs.open(original_file, "rb") as f: | |
133 | assert f.read() == old_data | |
134 | ||
135 | with open(original_file, "wb") as f: | |
136 | f.write(new_data) | |
137 | ||
138 | with fs.open(original_file, "rb") as f: | |
139 | assert f.read() == old_data | |
140 | ||
141 | ||
142 | def test_local_filecache_gets_from_original_if_cache_deleted(local_filecache): | |
143 | old_data, original_file, cache_location, fs = local_filecache | |
144 | new_data = b"abc" | |
145 | ||
146 | with fs.open(original_file, "rb") as f: | |
147 | assert f.read() == old_data | |
148 | ||
149 | with open(original_file, "wb") as f: | |
150 | f.write(new_data) | |
151 | ||
152 | shutil.rmtree(cache_location) | |
153 | assert os.path.exists(original_file) | |
154 | ||
155 | with open(original_file, "rb") as f: | |
156 | assert f.read() == new_data | |
157 | ||
158 | with fs.open(original_file, "rb") as f: | |
159 | assert f.read() == new_data | |
160 | ||
161 | # the file in the location contains the right data | |
162 | fn = list(fs.cached_files[-1].values())[0]["fn"] # this is a hash value | |
163 | assert fn in os.listdir(cache_location) | |
164 | with open(os.path.join(cache_location, fn), "rb") as f: | |
165 | assert f.read() == new_data | |
166 | ||
167 | ||
168 | def test_local_filecache_with_new_cache_location_makes_a_new_copy(local_filecache): | |
169 | import tempfile | |
170 | ||
171 | data, original_file, old_cache_location, old_fs = local_filecache | |
172 | new_cache_location = tempfile.mkdtemp() | |
173 | ||
174 | with old_fs.open(original_file, "rb") as f: | |
175 | assert f.read() == data | |
176 | ||
177 | new_fs = fsspec.filesystem( | |
178 | "filecache", target_protocol="file", cache_storage=new_cache_location | |
179 | ) | |
180 | ||
181 | with new_fs.open(original_file, "rb") as f: | |
182 | assert f.read() == data | |
183 | ||
184 | # the file in the location contains the right data | |
185 | fn = list(new_fs.cached_files[-1].values())[0]["fn"] # this is a hash value | |
186 | assert fn in os.listdir(old_cache_location) | |
187 | assert fn in os.listdir(new_cache_location) | |
188 | ||
189 | with open(os.path.join(new_cache_location, fn), "rb") as f: | |
190 | assert f.read() == data | |
191 | ||
192 | ||
193 | def test_filecache_multicache(): | |
194 | import tempfile | |
195 | ||
196 | origin = tempfile.mkdtemp() | |
197 | cache1 = tempfile.mkdtemp() | |
198 | cache2 = tempfile.mkdtemp() | |
199 | data = b"test data" | |
200 | f1 = os.path.join(origin, "afile") | |
201 | f2 = os.path.join(origin, "bfile") | |
202 | with open(f1, "wb") as f: | |
203 | f.write(data) | |
204 | with open(f2, "wb") as f: | |
205 | f.write(data * 2) | |
206 | ||
207 | # populates first cache | |
208 | fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1) | |
209 | assert fs.cat(f1) == data | |
210 | ||
211 | assert len(os.listdir(cache1)) == 2 # cache and hashed afile | |
212 | assert len(os.listdir(cache2)) == 0  # hasn't been initialized yet | |
213 | ||
214 | # populates last cache if file not found in first cache | |
215 | fs = fsspec.filesystem( | |
216 | "filecache", target_protocol="file", cache_storage=[cache1, cache2] | |
217 | ) | |
218 | ||
219 | assert fs.cat(f1) == data | |
220 | assert fs.cat(f2) == data * 2 | |
221 | ||
222 | assert "cache" in os.listdir(cache1) | |
223 | assert "cache" in os.listdir(cache2) | |
224 | ||
225 | cache1_contents = [f for f in os.listdir(cache1) if f != "cache"] | |
226 | assert len(cache1_contents) == 1 | |
227 | ||
228 | with open(os.path.join(cache1, cache1_contents[0]), "rb") as f: | |
229 | assert f.read() == data | |
230 | ||
231 | cache2_contents = [f for f in os.listdir(cache2) if f != "cache"] | |
232 | assert len(cache2_contents) == 1 | |
233 | ||
234 | with open(os.path.join(cache2, cache2_contents[0]), "rb") as f: | |
235 | assert f.read() == data * 2 | |
236 | ||
237 | ||
238 | def test_filecache_multicache_with_same_file_different_data_reads_from_first(): | |
239 | import tempfile | |
240 | ||
241 | origin = tempfile.mkdtemp() | |
242 | cache1 = tempfile.mkdtemp() | |
243 | cache2 = tempfile.mkdtemp() | |
244 | data = b"test data" | |
245 | f1 = os.path.join(origin, "afile") | |
246 | with open(f1, "wb") as f: | |
247 | f.write(data) | |
248 | ||
249 | # populate first cache | |
250 | fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1) | |
251 | assert fs.cat(f1) == data | |
252 | ||
253 | with open(f1, "wb") as f: | |
254 | f.write(data * 2) | |
255 | ||
256 | # populate second cache | |
257 | fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache2) | |
258 | ||
259 | assert fs.cat(f1) == data * 2 | |
260 | ||
261 | # the filenames in each cache are the same, but the data is different | |
262 | assert os.listdir(cache1) == os.listdir(cache2) | |
263 | ||
264 | fs = fsspec.filesystem( | |
265 | "filecache", target_protocol="file", cache_storage=[cache1, cache2] | |
266 | ) | |
267 | ||
268 | assert fs.cat(f1) == data | |
269 | ||
270 | ||
271 | def test_filecache_with_checks(): | |
272 | import tempfile | |
273 | import time | |
274 | ||
275 | origin = tempfile.mkdtemp() | |
276 | cache1 = tempfile.mkdtemp() | |
277 | data = b"test data" | |
278 | f1 = os.path.join(origin, "afile") | |
279 | with open(f1, "wb") as f: | |
280 | f.write(data) | |
281 | ||
282 | # populate first cache | |
283 | fs = fsspec.filesystem( | |
284 | "filecache", target_protocol="file", cache_storage=cache1, expiry_time=0.1 | |
285 | ) | |
286 | fs2 = fsspec.filesystem( | |
287 | "filecache", target_protocol="file", cache_storage=cache1, check_files=True | |
288 | ) | |
289 | assert fs.cat(f1) == data | |
290 | assert fs2.cat(f1) == data | |
291 | ||
292 | with open(f1, "wb") as f: | |
293 | f.write(data * 2) | |
294 | ||
295 | assert fs.cat(f1) == data # does not change | |
296 | assert fs2.cat(f1) == data * 2 # changed, since origin changed | |
297 | time.sleep(0.11) # allow cache details to expire | |
298 | assert fs.cat(f1) == data * 2 # changed, since origin changed | |
299 | ||
300 | ||
301 | def test_takes_fs_instance(): | |
302 | import tempfile | |
303 | ||
304 | origin = tempfile.mkdtemp() | |
305 | data = b"test data" | |
306 | f1 = os.path.join(origin, "afile") | |
307 | with open(f1, "wb") as f: | |
308 | f.write(data) | |
309 | ||
310 | fs = fsspec.filesystem("file") | |
311 | fs2 = fsspec.filesystem("filecache", target_protocol=fs) | |
312 | ||
313 | assert fs2.cat(f1) == data |
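314 | ||
315 | # A minimal sketch of the layered-cache behaviour exercised above, with | |
316 | # hypothetical storage paths: when cache_storage is a list, earlier | |
317 | # locations are consulted first and only the last one is written to. | |
318 | # | |
319 | #     fs = fsspec.filesystem( | |
320 | #         "filecache", | |
321 | #         target_protocol="file", | |
322 | #         cache_storage=["/ro/cache", "/rw/cache"], | |
323 | #     ) | |
324 | #     fs.cat("/data/afile")  # cache hit if present, else copied to /rw/cache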
0 | import pytest | |
1 | import fsspec | |
2 | ||
3 | pytest.importorskip("distributed") | |
4 | ||
5 | ||
6 | @pytest.fixture() | |
7 | def cli(tmpdir): | |
8 | import dask.distributed | |
9 | ||
10 | client = dask.distributed.Client(n_workers=1) | |
11 | ||
12 | def setup(): | |
13 | m = fsspec.filesystem("memory") | |
14 | with m.open("afile", "wb") as f: | |
15 | f.write(b"data") | |
16 | ||
17 | client.run(setup) | |
18 | try: | |
19 | yield client | |
20 | finally: | |
21 | client.close() | |
22 | ||
23 | ||
24 | def test_basic(cli): | |
25 | ||
26 | fs = fsspec.filesystem("dask", remote_protocol="memory") | |
27 | assert fs.ls("") == ["afile"] | |
28 | assert fs.cat("afile") == b"data" |
0 | import os | |
1 | import pytest | |
2 | import subprocess | |
3 | import sys | |
4 | import time | |
5 | ||
6 | from fsspec.implementations.ftp import FTPFileSystem | |
7 | from fsspec import open_files | |
8 | import fsspec | |
9 | ||
10 | here = os.path.dirname(os.path.abspath(__file__)) | |
11 | ||
12 | ||
13 | @pytest.fixture() | |
14 | def ftp(): | |
15 | P = subprocess.Popen( | |
16 | [sys.executable, "-m", "pyftpdlib", "-d", here], | |
17 | stderr=subprocess.STDOUT, | |
18 | stdout=subprocess.PIPE, | |
19 | ) | |
20 | try: | |
21 | time.sleep(1) | |
22 | yield "localhost", 2121 | |
23 | finally: | |
24 | P.terminate() | |
25 | P.wait() | |
26 | ||
27 | ||
28 | def test_basic(ftp): | |
29 | host, port = ftp | |
30 | fs = FTPFileSystem(host, port) | |
31 | assert fs.ls("/", detail=False) == sorted(os.listdir(here)) | |
32 | out = fs.cat("/" + os.path.basename(__file__)) | |
33 | with open(__file__, "rb") as f: | |
34 | assert out == f.read() | |
34 | ||
35 | ||
36 | def test_not_cached(ftp): | |
37 | host, port = ftp | |
38 | fs = FTPFileSystem(host, port) | |
39 | fs2 = FTPFileSystem(host, port) | |
40 | assert fs is not fs2 | |
41 | ||
42 | ||
43 | @pytest.mark.parametrize("cache_type", ["bytes", "mmap"]) | |
44 | def test_complex(ftp_writable, cache_type): | |
45 | from fsspec.core import BytesCache | |
46 | ||
47 | host, port, user, pw = ftp_writable | |
48 | files = open_files( | |
49 | "ftp:///ou*", | |
50 | host=host, | |
51 | port=port, | |
52 | username=user, | |
53 | password=pw, | |
54 | block_size=10000, | |
55 | cache_type=cache_type, | |
56 | ) | |
57 | assert len(files) == 1 | |
58 | with files[0] as fo: | |
59 | assert fo.read(10) == b"hellohello" | |
60 | if isinstance(fo.cache, BytesCache): | |
61 | assert len(fo.cache.cache) == 10010 | |
62 | assert fo.read(2) == b"he" | |
63 | assert fo.tell() == 12 | |
64 | ||
65 | ||
66 | def test_write_small(ftp_writable): | |
67 | host, port, user, pw = ftp_writable | |
68 | fs = FTPFileSystem(host, port, user, pw) | |
69 | with fs.open("/out2", "wb") as f: | |
70 | f.write(b"oi") | |
71 | assert fs.cat("/out2") == b"oi" | |
72 | ||
73 | ||
74 | def test_with_url(ftp_writable): | |
75 | host, port, user, pw = ftp_writable | |
76 | fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "wb") | |
77 | with fo as f: | |
78 | f.write(b"hello") | |
79 | fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "rb") | |
80 | with fo as f: | |
81 | assert f.read() == b"hello" | |
82 | ||
83 | ||
84 | @pytest.mark.parametrize("cache_type", ["bytes", "mmap"]) | |
85 | def test_write_big(ftp_writable, cache_type): | |
86 | host, port, user, pw = ftp_writable | |
87 | fs = FTPFileSystem(host, port, user, pw, block_size=1000, cache_type=cache_type) | |
88 | fn = "/bigger" | |
89 | with fs.open(fn, "wb") as f: | |
90 | f.write(b"o" * 500) | |
91 | assert not fs.exists(fn) | |
92 | f.write(b"o" * 1000) | |
93 | fs.invalidate_cache() | |
94 | assert fs.exists(fn) | |
95 | f.write(b"o" * 200) | |
96 | f.flush() | |
97 | ||
98 | assert fs.info(fn)["size"] == 1700 | |
99 | assert fs.cat(fn) == b"o" * 1700 | |
100 | ||
101 | ||
102 | def test_transaction(ftp_writable): | |
103 | host, port, user, pw = ftp_writable | |
104 | fs = FTPFileSystem(host, port, user, pw) | |
105 | fs.mkdir("/tmp") | |
106 | fn = "/tr" | |
107 | with fs.transaction: | |
108 | with fs.open(fn, "wb") as f: | |
109 | f.write(b"not") | |
110 | assert not fs.exists(fn) | |
111 | assert fs.exists(fn) | |
112 | assert fs.cat(fn) == b"not" | |
113 | ||
114 | fs.rm(fn) | |
115 | assert not fs.exists(fn) |
0 | import pytest | |
1 | from http.server import BaseHTTPRequestHandler, HTTPServer | |
2 | import threading | |
3 | import fsspec | |
4 | ||
5 | requests = pytest.importorskip("requests") | |
6 | port = 9898 | |
7 | data = b"\n".join([b"some test data"] * 1000) | |
8 | realfile = "http://localhost:%i/index/realfile" % port | |
9 | index = b'<a href="%s">Link</a>' % realfile.encode() | |
10 | ||
11 | ||
12 | class HTTPTestHandler(BaseHTTPRequestHandler): | |
13 | def _respond(self, code=200, headers=None, data=b""): | |
14 | headers = headers or {} | |
15 | headers.update({"User-Agent": "test"}) | |
16 | self.send_response(code) | |
17 | for k, v in headers.items(): | |
18 | self.send_header(k, str(v)) | |
19 | self.end_headers() | |
20 | if data: | |
21 | self.wfile.write(data) | |
22 | ||
23 | def do_GET(self): | |
24 | if self.path not in ["/index/realfile", "/index"]: | |
25 | self._respond(404) | |
26 | return | |
27 | ||
28 | d = data if self.path == "/index/realfile" else index | |
29 | if "Range" in self.headers: | |
30 | ran = self.headers["Range"] | |
31 | b, ran = ran.split("=") | |
32 | start, end = ran.split("-") | |
35 | d = d[int(start) : int(end) + 1] | |
36 | if "give_length" in self.headers: | |
37 | response_headers = {"Content-Length": len(d)} | |
38 | self._respond(200, response_headers, d) | |
39 | elif "give_range" in self.headers: | |
40 | self._respond(200, {"Content-Range": "0-%i/%i" % (len(d) - 1, len(d))}, d) | |
41 | else: | |
42 | self._respond(200, data=d) | |
43 | ||
44 | def do_HEAD(self): | |
45 | if "head_ok" not in self.headers: | |
46 | self._respond(405) | |
47 | return | |
48 | d = data if self.path == "/index/realfile" else index | |
49 | if self.path not in ["/index/realfile", "/index"]: | |
50 | self._respond(404) | |
51 | elif "give_length" in self.headers: | |
52 | response_headers = {"Content-Length": len(d)} | |
53 | if "zero_length" in self.headers: | |
54 | response_headers["Content-Length"] = 0 | |
55 | ||
56 | self._respond(200, response_headers) | |
57 | elif "give_range" in self.headers: | |
58 | self._respond(200, {"Content-Range": "0-%i/%i" % (len(d) - 1, len(d))}) | |
59 | else: | |
60 | self._respond(200) # OK response, but no useful info | |
61 | ||
62 | ||
63 | @pytest.fixture(scope="module") | |
64 | def server(): | |
65 | server_address = ("", port) | |
66 | httpd = HTTPServer(server_address, HTTPTestHandler) | |
67 | th = threading.Thread(target=httpd.serve_forever) | |
68 | th.daemon = True | |
69 | th.start() | |
70 | try: | |
71 | yield "http://localhost:%i" % port | |
72 | finally: | |
73 | httpd.socket.close() | |
74 | httpd.shutdown() | |
75 | th.join() | |
76 | ||
77 | ||
78 | def test_list(server): | |
79 | h = fsspec.filesystem("http") | |
80 | out = h.glob(server + "/index/*") | |
81 | assert out == [server + "/index/realfile"] | |
82 | ||
83 | ||
84 | def test_policy_arg(server): | |
85 | h = fsspec.filesystem("http", size_policy="get") | |
86 | out = h.glob(server + "/index/*") | |
87 | assert out == [server + "/index/realfile"] | |
88 | ||
89 | ||
90 | def test_exists(server): | |
91 | h = fsspec.filesystem("http") | |
92 | assert not h.exists(server + "/notafile") | |
93 | ||
94 | ||
95 | def test_read(server): | |
96 | h = fsspec.filesystem("http") | |
97 | out = server + "/index/realfile" | |
98 | with h.open(out, "rb") as f: | |
99 | assert f.read() == data | |
100 | with h.open(out, "rb", block_size=0) as f: | |
101 | assert f.read() == data | |
102 | with h.open(out, "rb") as f: | |
103 | assert f.read(100) + f.read() == data | |
104 | ||
105 | ||
106 | def test_methods(server): | |
107 | h = fsspec.filesystem("http") | |
108 | url = server + "/index/realfile" | |
109 | assert h.exists(url) | |
110 | assert h.cat(url) == data | |
111 | ||
112 | ||
113 | @pytest.mark.parametrize( | |
114 | "headers", | |
115 | [ | |
116 | {}, | |
117 | {"give_length": "true"}, | |
118 | {"give_length": "true", "head_ok": "true"}, | |
119 | {"give_range": "true"}, | |
120 | ], | |
121 | ) | |
122 | def test_random_access(server, headers): | |
123 | h = fsspec.filesystem("http", headers=headers) | |
124 | url = server + "/index/realfile" | |
125 | with h.open(url, "rb") as f: | |
126 | if headers: | |
127 | assert f.size == len(data) | |
128 | assert f.read(5) == data[:5] | |
129 | # python server does not respect bytes range request | |
130 | # we actually get all the data | |
131 | f.seek(5, 1) | |
132 | assert f.read(5) == data[10:15] | |
133 | ||
134 | ||
135 | def test_mapper_url(server): | |
136 | h = fsspec.filesystem("http") | |
137 | mapper = h.get_mapper(server + "/index/") | |
138 | assert mapper.root.startswith("http:") | |
139 | assert list(mapper) | |
140 | ||
141 | mapper2 = fsspec.get_mapper(server + "/index/") | |
142 | assert mapper2.root.startswith("http:") | |
143 | assert list(mapper) == list(mapper2) | |
144 | ||
145 | ||
146 | def test_content_length_zero(server): | |
147 | h = fsspec.filesystem( | |
148 | "http", headers={"give_length": "true", "zero_length": "true"} | |
149 | ) | |
150 | url = server + "/index/realfile" | |
151 | ||
152 | with h.open(url, "rb") as f: | |
153 | assert f.read() == data |
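154 | ||
155 | # A minimal sketch of the HTTP filesystem used above, against a hypothetical | |
156 | # URL; block_size=0 disables random access and streams the whole body: | |
157 | # | |
158 | #     h = fsspec.filesystem("http") | |
159 | #     with h.open("http://example.com/somefile", "rb", block_size=0) as f: | |
160 | #         body = f.read()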
0 | from __future__ import print_function, division, absolute_import | |
1 | ||
2 | import gzip | |
3 | import os | |
4 | import os.path | |
5 | import sys | |
6 | from contextlib import contextmanager | |
7 | import tempfile | |
8 | ||
9 | import pytest | |
10 | import fsspec | |
11 | from fsspec.core import open_files, get_fs_token_paths, OpenFile | |
12 | from fsspec.implementations.local import LocalFileSystem, make_path_posix | |
13 | from fsspec import compression | |
14 | ||
15 | files = { | |
16 | ".test.accounts.1.json": ( | |
17 | b'{"amount": 100, "name": "Alice"}\n' | |
18 | b'{"amount": 200, "name": "Bob"}\n' | |
19 | b'{"amount": 300, "name": "Charlie"}\n' | |
20 | b'{"amount": 400, "name": "Dennis"}\n' | |
21 | ), | |
22 | ".test.accounts.2.json": ( | |
23 | b'{"amount": 500, "name": "Alice"}\n' | |
24 | b'{"amount": 600, "name": "Bob"}\n' | |
25 | b'{"amount": 700, "name": "Charlie"}\n' | |
26 | b'{"amount": 800, "name": "Dennis"}\n' | |
27 | ), | |
28 | } | |
29 | ||
30 | ||
31 | csv_files = { | |
32 | ".test.fakedata.1.csv": (b"a,b\n" b"1,2\n"), | |
33 | ".test.fakedata.2.csv": (b"a,b\n" b"3,4\n"), | |
34 | } | |
35 | ||
36 | ||
37 | @contextmanager | |
38 | def filetexts(d, open=open, mode="t"): | |
39 | """ Dumps a number of textfiles to disk | |
40 | ||
41 | d - dict | |
42 | a mapping from filename to text like {'a.csv': '1,1\n2,2'} | |
43 | ||
44 | Since this is meant for use in tests, this context manager will | |
45 | automatically switch to a temporary current directory, to avoid | |
46 | race conditions when running tests in parallel. | |
47 | """ | |
48 | odir = os.getcwd() | |
49 | dirname = tempfile.mkdtemp() | |
50 | try: | |
51 | os.chdir(dirname) | |
52 | for filename, text in d.items(): | |
53 | f = open(filename, "w" + mode) | |
54 | try: | |
55 | f.write(text) | |
56 | finally: | |
57 | try: | |
58 | f.close() | |
59 | except AttributeError: | |
60 | pass | |
61 | ||
62 | yield list(d) | |
63 | ||
64 | for filename in d: | |
65 | if os.path.exists(filename): | |
66 | try: | |
67 | os.remove(filename) | |
68 | except (IOError, OSError): | |
69 | pass | |
70 | finally: | |
71 | os.chdir(odir) | |
72 | ||
73 | ||
74 | def test_urlpath_inference_strips_protocol(tmpdir): | |
75 | tmpdir = str(tmpdir) | |
76 | paths = [os.path.join(tmpdir, "test.%02d.csv" % i) for i in range(20)] | |
77 | ||
78 | for path in paths: | |
79 | with open(path, "wb") as f: | |
80 | f.write(b"1,2,3\n" * 10) | |
81 | ||
82 | # globstring | |
83 | protocol = "file:///" if sys.platform == "win32" else "file://" | |
84 | urlpath = protocol + os.path.join(tmpdir, "test.*.csv") | |
85 | _, _, paths2 = get_fs_token_paths(urlpath) | |
86 | assert paths2 == paths | |
87 | ||
88 | # list of paths | |
89 | _, _, paths2 = get_fs_token_paths([protocol + p for p in paths]) | |
90 | assert paths2 == paths | |
91 | ||
92 | ||
93 | def test_urlpath_inference_errors(): | |
94 | # Empty list | |
95 | with pytest.raises(ValueError) as err: | |
96 | get_fs_token_paths([]) | |
97 | assert "empty" in str(err.value) | |
98 | ||
99 | # Protocols differ | |
100 | with pytest.raises(ValueError) as err: | |
101 | get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"]) | |
102 | assert "same protocol" in str(err.value) | |
103 | ||
104 | # Unknown type | |
105 | with pytest.raises(TypeError): | |
106 | get_fs_token_paths( | |
107 | {"sets/are.csv", "unordered/so/they.csv", "should/not/be.csvallowed.csv"} | |
108 | ) | |
109 | ||
110 | ||
111 | def test_urlpath_expand_read(): | |
112 | """Make sure * is expanded in file paths when reading.""" | |
113 | # when reading, globs should be expanded to read files by mask | |
114 | with filetexts(csv_files, mode="b"): | |
115 | _, _, paths = get_fs_token_paths("./.*.csv") | |
116 | assert len(paths) == 2 | |
117 | _, _, paths = get_fs_token_paths(["./.*.csv"]) | |
118 | assert len(paths) == 2 | |
119 | ||
120 | ||
121 | def test_urlpath_expand_write(): | |
122 | """Make sure * is expanded in file paths when writing.""" | |
123 | _, _, paths = get_fs_token_paths("prefix-*.csv", mode="wb", num=2) | |
124 | assert all( | |
125 | [p.endswith(pa) for p, pa in zip(paths, ["/prefix-0.csv", "/prefix-1.csv"])] | |
126 | ) | |
127 | _, _, paths = get_fs_token_paths(["prefix-*.csv"], mode="wb", num=2) | |
128 | assert all( | |
129 | [p.endswith(pa) for p, pa in zip(paths, ["/prefix-0.csv", "/prefix-1.csv"])] | |
130 | ) | |
131 | # we can read with multiple masks, but not write | |
132 | with pytest.raises(ValueError): | |
133 | _, _, paths = get_fs_token_paths( | |
134 | ["prefix1-*.csv", "prefix2-*.csv"], mode="wb", num=2 | |
135 | ) | |
136 | ||
137 | ||
138 | def test_open_files(): | |
139 | with filetexts(files, mode="b"): | |
140 | myfiles = open_files("./.test.accounts.*") | |
141 | assert len(myfiles) == len(files) | |
142 | for lazy_file, data_file in zip(myfiles, sorted(files)): | |
143 | with lazy_file as f: | |
144 | x = f.read() | |
145 | assert x == files[data_file] | |
146 | ||
147 | ||
148 | @pytest.mark.parametrize("encoding", ["utf-8", "ascii"]) | |
149 | def test_open_files_text_mode(encoding): | |
150 | with filetexts(files, mode="b"): | |
151 | myfiles = open_files("./.test.accounts.*", mode="rt", encoding=encoding) | |
152 | assert len(myfiles) == len(files) | |
153 | data = [] | |
154 | for file in myfiles: | |
155 | with file as f: | |
156 | data.append(f.read()) | |
157 | assert list(data) == [files[k].decode(encoding) for k in sorted(files)] | |
158 | ||
159 | ||
160 | @pytest.mark.parametrize("mode", ["rt", "rb"]) | |
161 | @pytest.mark.parametrize("fmt", list(compression.compr)) | |
162 | def test_compressions(fmt, mode, tmpdir): | |
163 | if fmt == "zip" and sys.version_info < (3, 6): | |
164 | pytest.xfail("zip compression requires python3.6 or higher") | |
165 | ||
166 | tmpdir = str(tmpdir) | |
167 | fn = os.path.join(tmpdir, ".tmp.getsize") | |
168 | fs = LocalFileSystem() | |
169 | f = OpenFile(fs, fn, compression=fmt, mode="wb") | |
170 | data = b"Long line of readily compressible text" | |
171 | with f as fo: | |
172 | fo.write(data) | |
173 | if fmt is None: | |
174 | assert fs.size(fn) == len(data) | |
175 | else: | |
176 | assert fs.size(fn) != len(data) | |
177 | ||
178 | f = OpenFile(fs, fn, compression=fmt, mode=mode) | |
179 | with f as fo: | |
180 | if mode == "rb": | |
181 | assert fo.read() == data | |
182 | else: | |
183 | assert fo.read() == data.decode() | |
184 | ||
185 | ||
186 | def test_bad_compression(): | |
187 | with filetexts(files, mode="b"): | |
188 | for func in [open_files]: | |
189 | with pytest.raises(ValueError): | |
190 | func("./.test.accounts.*", compression="not-found") | |
191 | ||
192 | ||
193 | def test_not_found(): | |
194 | fn = "not-a-file" | |
195 | fs = LocalFileSystem() | |
196 | with pytest.raises((FileNotFoundError, OSError)): | |
197 | with OpenFile(fs, fn, mode="rb"): | |
198 | pass | |
199 | ||
200 | ||
201 | def test_isfile(): | |
202 | fs = LocalFileSystem() | |
203 | with filetexts(files, mode="b"): | |
204 | for f in files.keys(): | |
205 | assert fs.isfile(f) | |
206 | assert not fs.isfile("not-a-file") | |
207 | ||
208 | ||
209 | def test_isdir(): | |
210 | fs = LocalFileSystem() | |
211 | with filetexts(files, mode="b"): | |
212 | for f in files.keys(): | |
213 | assert fs.isdir(os.path.dirname(os.path.abspath(f))) | |
214 | assert not fs.isdir(f) | |
215 | assert not fs.isdir("not-a-dir") | |
216 | ||
217 | ||
218 | @pytest.mark.parametrize("compression_opener", [(None, open), ("gzip", gzip.open)]) | |
219 | def test_open_files_write(tmpdir, compression_opener): | |
220 | tmpdir = str(tmpdir) | |
221 | compression, opener = compression_opener | |
222 | fn = str(tmpdir) + "/*.part" | |
223 | files = open_files(fn, num=2, mode="wb", compression=compression) | |
224 | assert len(files) == 2 | |
225 | assert {f.mode for f in files} == {"wb"} | |
226 | for fil in files: | |
227 | with fil as f: | |
228 | f.write(b"000") | |
229 | files = sorted(os.listdir(tmpdir)) | |
230 | assert files == ["0.part", "1.part"] | |
231 | ||
232 | with opener(os.path.join(tmpdir, files[0]), "rb") as f: | |
233 | d = f.read() | |
234 | assert d == b"000" | |
235 | ||
236 | ||
237 | def test_pickability_of_lazy_files(tmpdir): | |
238 | tmpdir = str(tmpdir) | |
239 | cloudpickle = pytest.importorskip("cloudpickle") | |
240 | ||
241 | with filetexts(files, mode="b"): | |
242 | myfiles = open_files("./.test.accounts.*") | |
243 | myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles)) | |
244 | ||
245 | for f, f2 in zip(myfiles, myfiles2): | |
246 | assert f.path == f2.path | |
247 | assert isinstance(f.fs, type(f2.fs)) | |
248 | with f as f_open, f2 as f2_open: | |
249 | assert f_open.read() == f2_open.read() | |
250 | ||
251 | ||
252 | def test_abs_paths(tmpdir): | |
253 | tmpdir = str(tmpdir) | |
254 | here = os.getcwd() | |
255 | os.chdir(tmpdir) | |
256 | with open("tmp", "w") as f: | |
257 | f.write("hi") | |
258 | out = LocalFileSystem().glob("./*") | |
259 | assert len(out) == 1 | |
260 | assert os.sep in out[0] | |
261 | assert "tmp" in out[0] | |
262 | ||
263 | # I don't know what this was testing - but should avoid local paths anyway | |
264 | # fs = LocalFileSystem() | |
265 | os.chdir(here) | |
266 | # with fs.open('tmp', 'r') as f: | |
267 | # res = f.read() | |
268 | # assert res == 'hi' | |
269 | ||
270 | ||
271 | @pytest.mark.parametrize("sep", ["/", "\\"]) | |
272 | @pytest.mark.parametrize("chars", ["+", "++", "(", ")", "|", "\\"]) | |
273 | def test_glob_weird_characters(tmpdir, sep, chars): | |
274 | tmpdir = str(tmpdir) | |
275 | ||
276 | subdir = tmpdir + sep + "test" + chars + "x" | |
277 | os.mkdir(subdir) | |
278 | with open(subdir + sep + "tmp", "w") as f: | |
279 | f.write("hi") | |
280 | ||
281 | out = LocalFileSystem().glob(subdir + sep + "*") | |
282 | assert len(out) == 1 | |
283 | assert os.sep in out[0] | |
284 | assert "tmp" in out[0] | |
285 | ||
286 | ||
287 | def test_globfind_dirs(tmpdir): | |
288 | tmpdir = str(tmpdir) | |
289 | fs = fsspec.filesystem("file") | |
290 | fs.mkdir(tmpdir + "/dir") | |
291 | fs.touch(tmpdir + "/dir/afile") | |
292 | assert [tmpdir + "/dir"] == fs.glob(tmpdir + "/*") | |
293 | assert [tmpdir + "/dir/afile"] == fs.find(tmpdir) | |
294 | assert [tmpdir + "/dir", tmpdir + "/dir/afile"] == fs.find(tmpdir, withdirs=True) | |
295 | ||
296 | ||
297 | def test_get_pyarrow_filesystem(): | |
298 | pa = pytest.importorskip("pyarrow") | |
299 | ||
300 | fs = LocalFileSystem() | |
301 | assert isinstance(fs, pa.filesystem.FileSystem) | |
302 | assert fs._get_pyarrow_filesystem() is fs | |
303 | ||
304 | class UnknownFileSystem(object): | |
305 | pass | |
306 | ||
307 | assert not isinstance(UnknownFileSystem(), pa.filesystem.FileSystem) | |
308 | ||
309 | ||
310 | def test_directories(tmpdir): | |
311 | tmpdir = str(tmpdir) | |
312 | fs = LocalFileSystem() | |
313 | fs.mkdir(tmpdir + "/dir") | |
314 | assert tmpdir + "/dir" in fs.ls(tmpdir) | |
315 | assert fs.ls(tmpdir, True)[0]["type"] == "directory" | |
316 | fs.rmdir(tmpdir + "/dir") | |
317 | assert not fs.ls(tmpdir) | |
318 | ||
319 | ||
320 | def test_file_ops(tmpdir): | |
321 | tmpdir = str(tmpdir) | |
322 | fs = LocalFileSystem() | |
323 | with pytest.raises(FileNotFoundError): | |
324 | fs.info(tmpdir + "/nofile") | |
325 | fs.touch(tmpdir + "/afile") | |
326 | i1 = fs.ukey(tmpdir + "/afile") | |
327 | ||
328 | assert tmpdir + "/afile" in fs.ls(tmpdir) | |
329 | ||
330 | with fs.open(tmpdir + "/afile", "wb") as f: | |
331 | f.write(b"data") | |
332 | i2 = fs.ukey(tmpdir + "/afile") | |
333 | assert i1 != i2 # because file changed | |
334 | ||
335 | fs.copy(tmpdir + "/afile", tmpdir + "/afile2") | |
336 | assert tmpdir + "/afile2" in fs.ls(tmpdir) | |
337 | ||
338 | fs.move(tmpdir + "/afile", tmpdir + "/afile3") | |
339 | assert not fs.exists(tmpdir + "/afile") | |
340 | ||
341 | fs.rm(tmpdir + "/afile3", recursive=True) | |
342 | assert not fs.exists(tmpdir + "/afile3") | |
343 | ||
344 | fs.rm(tmpdir, recursive=True) | |
345 | assert not fs.exists(tmpdir) | |
346 | ||
347 | ||
348 | def test_recursive_get_put(tmpdir): | |
349 | tmpdir = str(tmpdir) | |
350 | fs = LocalFileSystem() | |
351 | ||
352 | fs.mkdir(tmpdir + "/a1/a2/a3") | |
353 | fs.touch(tmpdir + "/a1/a2/a3/afile") | |
354 | fs.touch(tmpdir + "/a1/afile") | |
355 | ||
356 | fs.get("file://{0}/a1".format(tmpdir), tmpdir + "/b1", recursive=True) | |
357 | assert fs.isfile(tmpdir + "/b1/afile") | |
358 | assert fs.isfile(tmpdir + "/b1/a2/a3/afile") | |
359 | ||
360 | fs.put(tmpdir + "/b1", "file://{0}/c1".format(tmpdir), recursive=True) | |
361 | assert fs.isfile(tmpdir + "/c1/afile") | |
362 | assert fs.isfile(tmpdir + "/c1/a2/a3/afile") | |
363 | ||
364 | ||
365 | def test_commit_discard(tmpdir): | |
366 | tmpdir = str(tmpdir) | |
367 | fs = LocalFileSystem() | |
368 | with fs.transaction: | |
369 | with fs.open(tmpdir + "/afile", "wb") as f: | |
370 | assert not fs.exists(tmpdir + "/afile") | |
371 | f.write(b"data") | |
372 | assert not fs.exists(tmpdir + "/afile") | |
373 | ||
374 | assert fs._transaction is None | |
375 | assert fs.cat(tmpdir + "/afile") == b"data" | |
376 | ||
377 | try: | |
378 | with fs.transaction: | |
379 | with fs.open(tmpdir + "/bfile", "wb") as f: | |
380 | f.write(b"data") | |
381 | raise KeyboardInterrupt | |
382 | except KeyboardInterrupt: | |
383 | assert not fs.exists(tmpdir + "/bfile") | |
384 | ||
385 | ||
386 | def test_make_path_posix(): | |
387 | cwd = os.getcwd() | |
388 | assert make_path_posix("/a/posix/path") == "/a/posix/path" | |
389 | assert make_path_posix("/posix") == "/posix" | |
390 | assert make_path_posix("relpath", sep="/") == os.path.join(cwd, "relpath") | |
391 | assert make_path_posix("rel/path", sep="/") == os.path.join(cwd, "rel/path") | |
392 | assert make_path_posix("C:\\path", sep="\\") == "C:/path" | |
393 | assert ( | |
394 | make_path_posix( | |
395 | "\\\\windows-server\\someshare\\path\\more\\path\\dir\\foo.parquet" | |
396 | ) | |
397 | == "//windows-server/someshare/path/more/path/dir/foo.parquet" | |
398 | ) | |
399 | assert "/" in make_path_posix("rel\\path", sep="\\") | |
400 | ||
401 | ||
402 | def test_links(tmpdir): | |
403 | tmpdir = str(tmpdir) | |
404 | fn0 = os.path.join(tmpdir, "target") | |
405 | fn1 = os.path.join(tmpdir, "link1") | |
406 | fn2 = os.path.join(tmpdir, "link2") | |
407 | data = b"my target data" | |
408 | with open(fn0, "wb") as f: | |
409 | f.write(data) | |
410 | os.symlink(fn0, fn1) | |
411 | os.symlink(fn0, fn2) | |
412 | ||
413 | fs = LocalFileSystem() | |
414 | assert fs.info(fn0)["type"] == "file" | |
415 | assert fs.info(fn1)["type"] == "link" | |
416 | assert fs.info(fn2)["type"] == "link" | |
417 | ||
418 | assert fs.info(fn0)["size"] == len(data) | |
419 | assert fs.info(fn1)["size"] == len(data) | |
420 | assert fs.info(fn2)["size"] == len(data) | |
421 | ||
422 | of = fsspec.open(fn1, "rb") | |
423 | with of as f: | |
424 | assert f.read() == data | |
425 | ||
426 | of = fsspec.open(fn2, "rb") | |
427 | with of as f: | |
428 | assert f.read() == data |
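429 | ||
430 | # A minimal sketch of the open_files pattern exercised above, with a | |
431 | # hypothetical glob; each entry is a lazy OpenFile, and the compression, | |
432 | # when given by name, is applied to every file: | |
433 | # | |
434 | #     files = open_files("./data.*.csv.gz", mode="rt", compression="gzip") | |
435 | #     with files[0] as f: | |
436 | #         first = f.read()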
0 | import pytest | |
1 | import sys | |
2 | ||
3 | ||
4 | def test_1(m): | |
5 | m.touch("/somefile") # NB: is found with or without initial / | |
6 | m.touch("afiles/and/anothers") | |
7 | assert m.find("") == ["afiles/and/anothers", "somefile"] | |
8 | assert list(m.get_mapper("")) == ["afiles/and/anothers", "somefile"] | |
9 | ||
10 | ||
11 | @pytest.mark.xfail( | |
12 | sys.version_info < (3, 6), | |
13 | reason="py35 error, see https://github.com/intake/filesystem_spec/issues/148", | |
14 | ) | |
15 | def test_ls(m): | |
16 | m.touch("/dir/afile") | |
17 | m.touch("/dir/dir1/bfile") | |
18 | m.touch("/dir/dir1/cfile") | |
19 | ||
20 | assert m.ls("/", False) == ["/dir/"] | |
21 | assert m.ls("/dir", False) == ["/dir/afile", "/dir/dir1/"] | |
22 | assert m.ls("/dir", True)[0]["type"] == "file" | |
23 | assert m.ls("/dir", True)[1]["type"] == "directory" | |
24 | ||
25 | assert len(m.ls("/dir/dir1")) == 2 |
0 | import pytest | |
1 | import shlex | |
2 | import subprocess | |
3 | import time | |
4 | import fsspec | |
5 | ||
6 | pytest.importorskip("paramiko") | |
7 | ||
8 | ||
9 | def stop_docker(name): | |
10 | cmd = shlex.split('docker ps -a -q --filter "name=%s"' % name) | |
11 | cid = subprocess.check_output(cmd).strip().decode() | |
12 | if cid: | |
13 | subprocess.call(["docker", "rm", "-f", cid]) | |
14 | ||
15 | ||
16 | @pytest.fixture(scope="module") | |
17 | def ssh(): | |
18 | try: | |
19 | subprocess.check_call(["docker", "run", "hello-world"]) | |
20 | except subprocess.CalledProcessError: | |
21 | pytest.skip("docker run not available") | |
22 | return | |
23 | ||
24 | # requires docker | |
25 | cmds = [ | |
26 | r"apt-get update", | |
27 | r"apt-get install -y openssh-server", | |
28 | r"mkdir /var/run/sshd", | |
29 | "bash -c \"echo 'root:pass' | chpasswd\"", | |
30 | ( | |
31 | r"sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' " | |
32 | r"/etc/ssh/sshd_config" | |
33 | ), | |
34 | ( | |
35 | r"sed 's@session\s*required\s*pam_loginuid.so@session optional " | |
36 | r"pam_loginuid.so@g' -i /etc/pam.d/sshd" | |
37 | ), | |
38 | r'bash -c "echo \"export VISIBLE=now\" >> /etc/profile"', | |
39 | r"/usr/sbin/sshd", | |
40 | ] | |
41 | name = "fsspec_sftp" | |
42 | stop_docker(name) | |
43 | cmd = "docker run -d -p 9200:22 --name {} ubuntu:16.04 sleep 9000".format(name) | |
44 | cid = subprocess.check_output(shlex.split(cmd)).strip().decode() | |
45 | for cmd in cmds: | |
46 | subprocess.call(["docker", "exec", cid] + shlex.split(cmd)) | |
47 | try: | |
48 | time.sleep(1) | |
49 | yield dict(host="localhost", port=9200, username="root", password="pass") | |
50 | finally: | |
51 | stop_docker(name) | |
52 | ||
53 | ||
54 | def test_simple(ssh): | |
55 | f = fsspec.get_filesystem_class("sftp")(**ssh) | |
56 | f.mkdirs("/home/someuser/deeper") | |
57 | f.touch("/home/someuser/deeper/afile") | |
58 | assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"] | |
59 | assert f.ls("/home/someuser/deeper/") == ["/home/someuser/deeper/afile"] | |
60 | assert f.info("/home/someuser/deeper/afile")["type"] == "file" | |
61 | assert f.info("/home/someuser/deeper/afile")["size"] == 0 | |
62 | assert f.exists("/home/someuser") | |
63 | f.rm("/home/someuser", recursive=True) | |
64 | assert not f.exists("/home/someuser") | |
65 | ||
66 | ||
67 | @pytest.mark.parametrize("protocol", ["sftp", "ssh"]) | |
68 | def test_with_url(protocol, ssh): | |
69 | fo = fsspec.open( | |
70 | protocol + "://{username}:{password}@{host}:{port}" | |
71 | "/home/someuserout".format(**ssh), | |
72 | "wb", | |
73 | ) | |
74 | with fo as f: | |
75 | f.write(b"hello") | |
76 | fo = fsspec.open( | |
77 | protocol + "://{username}:{password}@{host}:{port}" | |
78 | "/home/someuserout".format(**ssh), | |
79 | "rb", | |
80 | ) | |
81 | with fo as f: | |
82 | assert f.read() == b"hello" | |
83 | ||
84 | ||
85 | def test_transaction(ssh): | |
86 | f = fsspec.get_filesystem_class("sftp")(**ssh) | |
87 | f.mkdirs("/home/someuser/deeper") | |
88 | f.start_transaction() | |
89 | f.touch("/home/someuser/deeper/afile") | |
90 | assert f.find("/home/someuser") == [] | |
91 | f.end_transaction() | |
92 | assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"] | |
93 | ||
94 | with f.transaction: | |
95 | assert f._intrans | |
96 | f.touch("/home/someuser/deeper/afile2") | |
97 | assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"] | |
98 | assert f.find("/home/someuser") == [ | |
99 | "/home/someuser/deeper/afile", | |
100 | "/home/someuser/deeper/afile2", | |
101 | ] | |
102 | ||
103 | ||
104 | def test_makedirs_exist_ok(ssh): | |
105 | f = fsspec.get_filesystem_class("sftp")(**ssh) | |
106 | ||
107 | f.makedirs("/a/b/c") | |
108 | ||
109 | with pytest.raises(FileExistsError, match="/a/b/c"): | |
110 | f.makedirs("/a/b/c", exist_ok=False) | |
111 | ||
112 | f.makedirs("/a/b/c", exist_ok=True) |
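113 | ||
114 | # A minimal sketch of the transaction pattern exercised above (paths | |
115 | # hypothetical): files written inside the context only become visible at | |
116 | # the target location once the transaction commits. | |
117 | # | |
118 | #     with f.transaction: | |
119 | #         f.touch("/home/user/afile")  # staged, not yet visible | |
120 | #     # committed and visible here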
0 | import pickle | |
1 | import pytest | |
2 | import subprocess | |
3 | import time | |
4 | import fsspec | |
5 | ||
6 | requests = pytest.importorskip("requests") | |
7 | ||
8 | from fsspec.implementations.webhdfs import WebHDFS # noqa: E402 | |
9 | ||
10 | ||
11 | @pytest.fixture(scope="module") | |
12 | def hdfs_cluster(): | |
13 | cmd0 = "htcluster shutdown".split() | |
14 | try: | |
15 | subprocess.check_output(cmd0, stderr=subprocess.STDOUT) | |
16 | except FileNotFoundError: | |
17 | pytest.skip("htcluster not found") | |
18 | except subprocess.CalledProcessError as ex: | |
19 | pytest.skip("htcluster failed: " + ex.output.decode()) | |
20 | cmd1 = "htcluster startup --image base".split() | |
21 | subprocess.check_output(cmd1) | |
22 | try: | |
23 | t = 90 | |
24 | while True: | |
25 | try: | |
26 | requests.get("http://localhost:50070/webhdfs/v1/?op=LISTSTATUS") | |
27 | except: # noqa: E722 | |
28 | t -= 1 | |
29 | assert t > 0, "Timeout waiting for HDFS" | |
30 | time.sleep(1) | |
31 | continue | |
32 | break | |
33 | time.sleep(7) | |
34 | yield "localhost" | |
35 | finally: | |
36 | subprocess.check_output(cmd0) | |
37 | ||
38 | ||
39 | def test_pickle(hdfs_cluster): | |
40 | w = WebHDFS(hdfs_cluster, user="testuser") | |
41 | w2 = pickle.loads(pickle.dumps(w)) | |
42 | assert w == w2 | |
43 | ||
44 | ||
45 | def test_simple(hdfs_cluster): | |
46 | w = WebHDFS(hdfs_cluster, user="testuser") | |
47 | home = w.home_directory() | |
48 | assert home == "/user/testuser" | |
49 | with pytest.raises(PermissionError): | |
50 | w.mkdir("/root") | |
51 | ||
52 | ||
53 | def test_url(hdfs_cluster): | |
54 | url = "webhdfs://testuser@localhost:50070/user/testuser/myfile" | |
55 | fo = fsspec.open(url, "wb", data_proxy={"worker.example.com": "localhost"}) | |
56 | with fo as f: | |
57 | f.write(b"hello") | |
58 | fo = fsspec.open(url, "rb", data_proxy={"worker.example.com": "localhost"}) | |
59 | with fo as f: | |
60 | assert f.read() == b"hello" | |
61 | ||
62 | ||
63 | def test_workflow(hdfs_cluster): | |
64 | w = WebHDFS( | |
65 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} | |
66 | ) | |
67 | fn = "/user/testuser/testrun/afile" | |
68 | w.mkdir("/user/testuser/testrun") | |
69 | with w.open(fn, "wb") as f: | |
70 | f.write(b"hello") | |
71 | assert w.exists(fn) | |
72 | info = w.info(fn) | |
73 | assert info["size"] == 5 | |
74 | assert w.isfile(fn) | |
75 | assert w.cat(fn) == b"hello" | |
76 | w.rm("/user/testuser/testrun", recursive=True) | |
77 | assert not w.exists(fn) | |
78 | ||
79 | ||
80 | def test_with_gzip(hdfs_cluster): | |
81 | from gzip import GzipFile | |
82 | ||
83 | w = WebHDFS( | |
84 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} | |
85 | ) | |
86 | fn = "/user/testuser/gzfile" | |
87 | with w.open(fn, "wb") as f: | |
88 | gf = GzipFile(fileobj=f, mode="w") | |
89 | gf.write(b"hello") | |
90 | gf.close() | |
91 | with w.open(fn, "rb") as f: | |
92 | gf = GzipFile(fileobj=f, mode="r") | |
93 | assert gf.read() == b"hello" | |
94 | ||
95 | ||
96 | def test_workflow_transaction(hdfs_cluster): | |
97 | w = WebHDFS( | |
98 | hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} | |
99 | ) | |
100 | fn = "/user/testuser/testrun/afile" | |
101 | w.mkdirs("/user/testuser/testrun") | |
102 | with w.transaction: | |
103 | with w.open(fn, "wb") as f: | |
104 | f.write(b"hello") | |
105 | assert not w.exists(fn) | |
106 | assert w.exists(fn) | |
107 | assert w.ukey(fn) | |
108 | files = w.ls("/user/testuser/testrun", True) | |
109 | summ = w.content_summary("/user/testuser/testrun") | |
110 | assert summ["length"] == files[0]["size"] | |
111 | assert summ["fileCount"] == 1 | |
112 | ||
113 | w.rm("/user/testuser/testrun", recursive=True) | |
114 | assert not w.exists(fn) |
0 | import zipfile | |
1 | from contextlib import contextmanager | |
2 | import os | |
3 | import pickle | |
4 | import pytest | |
5 | import sys | |
6 | import tempfile | |
7 | import fsspec | |
8 | ||
9 | ||
10 | @contextmanager | |
11 | def tempzip(data={}): | |
12 | f = tempfile.mkstemp(suffix="zip")[1] | |
13 | with zipfile.ZipFile(f, mode="w") as z: | |
14 | for k, v in data.items(): | |
15 | z.writestr(k, v) | |
16 | try: | |
17 | yield f | |
18 | finally: | |
19 | try: | |
20 | os.remove(f) | |
21 | except (IOError, OSError): | |
22 | pass | |
23 | ||
24 | ||
25 | data = {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"} | |
26 | ||
27 | ||
28 | def test_empty(): | |
29 | with tempzip() as z: | |
30 | fs = fsspec.get_filesystem_class("zip")(fo=z) | |
31 | assert fs.find("") == [] | |
32 | ||
33 | ||
34 | @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip-info odd on py35") | |
35 | def test_mapping(): | |
36 | with tempzip(data) as z: | |
37 | fs = fsspec.get_filesystem_class("zip")(fo=z) | |
38 | m = fs.get_mapper("") | |
39 | assert list(m) == ["a", "b", "deeply/nested/path"] | |
40 | assert m["b"] == data["b"] | |
41 | ||
42 | ||
43 | @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip not supported on py35") | |
44 | def test_pickle(): | |
45 | with tempzip(data) as z: | |
46 | fs = fsspec.get_filesystem_class("zip")(fo=z) | |
47 | fs2 = pickle.loads(pickle.dumps(fs)) | |
48 | assert fs2.cat("b") == b"hello" |
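49 | ||
50 | # A minimal sketch of read-only access to a zip archive at a hypothetical | |
51 | # path; get_mapper exposes the archive members as a key/value mapping: | |
52 | # | |
53 | #     fs = fsspec.get_filesystem_class("zip")(fo="/path/to/archive.zip") | |
54 | #     m = fs.get_mapper("") | |
55 | #     names = list(m)        # member names | |
56 | #     payload = m["member"]  # bytes of that member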
0 | # https://hadoop.apache.org/docs/r1.0.4/webhdfs.html | |
1 | ||
2 | import requests | |
3 | from urllib.parse import quote | |
4 | import uuid | |
5 | from ..spec import AbstractFileSystem, AbstractBufferedFile | |
6 | from ..utils import infer_storage_options | |
7 | import logging | |
8 | ||
9 | logger = logging.getLogger("webhdfs") | |
10 | ||
11 | ||
12 | class WebHDFS(AbstractFileSystem): | |
13 | """ | |
14 | Interface to HDFS over HTTP | |
15 | ||
16 | Three auth mechanisms are supported: | |
17 | ||
18 | insecure: no auth is done, and the user is assumed to be whoever they | |
19 | say they are (parameter `user`), or a predefined value such as | |
20 | "dr.who" if not given | |
21 | spnego: when kerberos authentication is enabled, auth is negotiated by | |
22 | requests_kerberos https://github.com/requests/requests-kerberos . | |
23 | This establishes a session based on existing kinit login and/or | |
24 | specified principal/password; parameters are passed with ``kerb_kwargs`` | |
25 | token: uses an existing Hadoop delegation token from another secured | |
26 | service. Indeed, this client can also generate such tokens when | |
27 | not insecure. Note that tokens expire, but can be renewed (by a | |
28 | previously specified user) and may allow for proxying. | |
29 | ||
30 | """ | |
31 | ||
32 | tempdir = "/tmp" | |
33 | protocol = "webhdfs", "webHDFS" | |
34 | ||
35 | def __init__( | |
36 | self, | |
37 | host, | |
38 | port=50070, | |
39 | kerberos=False, | |
40 | token=None, | |
41 | user=None, | |
42 | proxy_to=None, | |
43 | kerb_kwargs=None, | |
44 | data_proxy=None, | |
45 | **kwargs | |
46 | ): | |
47 | """ | |
48 | Parameters | |
49 | ---------- | |
50 | host: str | |
51 | Name-node address | |
52 | port: int | |
53 | Port for webHDFS | |
54 | kerberos: bool | |
55 | Whether to authenticate with kerberos for this connection | |
56 | token: str or None | |
57 | If given, use this token on every call to authenticate. A user | |
58 | and user-proxy may be encoded in the token and should not also | |
59 | be given | |
60 | user: str or None | |
61 | If given, assert the user name to connect with | |
62 | proxy_to: str or None | |
63 | If given, the user has the authority to proxy, and this value is | |
64 | the user in whose name actions are taken | |
65 | kerb_kwargs: dict | |
66 | Any extra arguments for HTTPKerberosAuth, see | |
67 | https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py | |
68 | data_proxy: dict, callable or None | |
69 | If given, map data-node addresses. This can be necessary if the | |
70 | HDFS cluster is behind a proxy, running on Docker or otherwise has | |
71 | a mismatch between the host-names given by the name-node and the | |
72 | address by which to refer to them from the client. If a dict, | |
73 | maps host names `host->data_proxy[host]`; if a callable, full | |
74 | URLs are passed, and function must conform to | |
75 | `url->data_proxy(url)`. | |
76 | kwargs | |
77 | """ | |
78 | if self._cached: | |
79 | return | |
80 | super().__init__(**kwargs) | |
81 | self.url = "http://{host}:{port}/webhdfs/v1".format(host=host, port=port) | |
82 | self.kerb = kerberos | |
83 | self.kerb_kwargs = kerb_kwargs or {} | |
84 | self.pars = {} | |
85 | self.proxy = data_proxy or {} | |
86 | if token is not None: | |
87 | if user is not None or proxy_to is not None: | |
88 | raise ValueError( | |
89 | "If passing a delegation token, must not set " | |
90 | "user or proxy_to, as these are encoded in the" | |
91 | " token" | |
92 | ) | |
93 | self.pars["delegation"] = token | |
94 | if user is not None: | |
95 | self.pars["user.name"] = user | |
96 | if proxy_to is not None: | |
97 | self.pars["doas"] = proxy_to | |
98 | if kerberos and user is not None: | |
99 | raise ValueError( | |
100 | "If using Kerberos auth, do not specify the " | |
101 | "user, this is handled by kinit." | |
102 | ) | |
103 | self._connect() | |
104 | ||
105 | def _connect(self): | |
106 | self.session = requests.Session() | |
107 | if self.kerb: | |
108 | from requests_kerberos import HTTPKerberosAuth | |
109 | ||
110 | self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs) | |
111 | ||
112 | def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs): | |
113 | url = self.url + quote(path or "") | |
114 | args = kwargs.copy() | |
115 | args.update(self.pars) | |
116 | args["op"] = op.upper() | |
117 | logger.debug("%s %s %s", url, method, args) | |
118 | out = self.session.request( | |
119 | method=method.upper(), | |
120 | url=url, | |
121 | params=args, | |
122 | data=data, | |
123 | allow_redirects=redirect, | |
124 | ) | |
125 | if out.status_code == 404: | |
126 | raise FileNotFoundError(path) | |
127 | if out.status_code == 403: | |
128 | raise PermissionError(path or "") | |
129 | if out.status_code == 401: | |
130 | raise PermissionError # not specific to path | |
131 | out.raise_for_status() | |
132 | return out | |
133 | ||
134 | def _open( | |
135 | self, | |
136 | path, | |
137 | mode="rb", | |
138 | block_size=None, | |
139 | autocommit=True, | |
140 | replication=None, | |
141 | permissions=None, | |
142 | **kwargs | |
143 | ): | |
144 | """ | |
145 | ||
146 | Parameters | |
147 | ---------- | |
148 | path: str | |
149 | File location | |
150 | mode: str | |
151 | 'rb', 'wb', etc. | |
152 | block_size: int | |
153 | Client buffer size for read-ahead or write buffer | |
154 | autocommit: bool | |
155 | If False, writes to temporary file that only gets put in final | |
156 | location upon commit | |
157 | replication: int | |
158 | Number of copies of file on the cluster, write mode only | |
159 | permissions: str or int | |
160 | posix permissions, write mode only | |
161 | kwargs | |
162 | ||
163 | Returns | |
164 | ------- | |
165 | WebHDFile instance | |
166 | """ | |
167 | block_size = block_size or self.blocksize | |
168 | return WebHDFile( | |
169 | self, | |
170 | path, | |
171 | mode=mode, | |
172 | block_size=block_size, | |
173 | tempdir=self.tempdir, | |
174 | autocommit=autocommit, | |
175 | replication=replication, | |
176 | permissions=permissions, | |
177 | ) | |
178 | ||
179 | @staticmethod | |
180 | def _process_info(info): | |
181 | info["type"] = info["type"].lower() | |
182 | info["size"] = info["length"] | |
183 | return info | |
184 | ||
185 | @classmethod | |
186 | def _strip_protocol(cls, path): | |
187 | return infer_storage_options(path)["path"] | |
188 | ||
189 | @staticmethod | |
190 | def _get_kwargs_from_urls(urlpath): | |
191 | out = infer_storage_options(urlpath) | |
192 | out.pop("path", None) | |
193 | out.pop("protocol", None) | |
194 | if "username" in out: | |
195 | out["user"] = out.pop("username") | |
196 | return out | |
197 | ||
198 | def info(self, path): | |
199 | out = self._call("GETFILESTATUS", path=path) | |
200 | info = out.json()["FileStatus"] | |
201 | info["name"] = path | |
202 | return self._process_info(info) | |
203 | ||
204 | def ls(self, path, detail=False): | |
205 | out = self._call("LISTSTATUS", path=path) | |
206 | infos = out.json()["FileStatuses"]["FileStatus"] | |
207 | for info in infos: | |
208 | self._process_info(info) | |
209 | info["name"] = path.rstrip("/") + "/" + info["pathSuffix"] | |
210 | if detail: | |
211 | return sorted(infos, key=lambda i: i["name"]) | |
212 | else: | |
213 | return sorted(info["name"] for info in infos) | |
214 | ||
215 | def content_summary(self, path): | |
216 | """Total numbers of files, directories and bytes under path""" | |
217 | out = self._call("GETCONTENTSUMMARY", path=path) | |
218 | return out.json()["ContentSummary"] | |
219 | ||
220 | def ukey(self, path): | |
221 | """Checksum info of file, giving method and result""" | |
222 | out = self._call("GETFILECHECKSUM", path=path, redirect=False) | |
223 | location = self._apply_proxy(out.headers["Location"]) | |
224 | out2 = self.session.get(location) | |
225 | out2.raise_for_status() | |
226 | return out2.json()["FileChecksum"] | |
227 | ||
228 | def home_directory(self): | |
229 | """Get user's home directory""" | |
230 | out = self._call("GETHOMEDIRECTORY") | |
231 | return out.json()["Path"] | |
232 | ||
233 | def get_delegation_token(self, renewer=None): | |
234 | """Retrieve token which can give the same authority to other uses | |
235 | ||
236 | Parameters | |
237 | ---------- | |
238 | renewer: str or None | |
239 | User who may use this token; if None, will be current user | |
240 | """ | |
241 | if renewer: | |
242 | out = self._call("GETDELEGATIONTOKEN", renewer=renewer) | |
243 | else: | |
244 | out = self._call("GETDELEGATIONTOKEN") | |
245 | t = out.json()["Token"] | |
246 | if t is None: | |
247 | raise ValueError("No token available for this user/security context") | |
248 | return t["urlString"] | |
249 | ||
250 | def renew_delegation_token(self, token): | |
251 | """Make token live longer. Returns new expiry time""" | |
252 | out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token) | |
253 | return out.json()["long"] | |
254 | ||
255 | def cancel_delegation_token(self, token): | |
256 | """Stop the token from being useful""" | |
257 | self._call("CANCELDELEGATIONTOKEN", method="put", token=token) | |
258 | ||
259 | def chmod(self, path, mod): | |
260 | """Set the permission at path | |
261 | ||
262 | Parameters | |
263 | ---------- | |
264 | path: str | |
265 | location to set (file or directory) | |
266 | mod: str or int | |
267 | posix representation of permission, given as oct string, e.g., '777' | |
268 | or 0o777 | |
269 | """ | |
270 | self._call("SETPERMISSION", method="put", path=path, permission=mod) | |
271 | ||
272 | def chown(self, path, owner=None, group=None): | |
273 | """Change owning user and/or group""" | |
274 | kwargs = {} | |
275 | if owner is not None: | |
276 | kwargs["owner"] = owner | |
277 | if group is not None: | |
278 | kwargs["group"] = group | |
279 | self._call("SETOWNER", method="put", path=path, **kwargs) | |
280 | ||
281 | def set_replication(self, path, replication): | |
282 | """ | |
283 | Set file replication factor | |
284 | ||
285 | Parameters | |
286 | ---------- | |
287 | path: str | |
288 | File location (not for directories) | |
289 | replication: int | |
290 | Number of copies of file on the cluster. Should be smaller than | |
291 | number of data nodes; normally 3 on most systems. | |
292 | """ | |
293 | self._call("SETREPLICATION", path=path, method="put", replication=replication) | |
294 | ||
295 | def mkdir(self, path, **kwargs): | |
296 | self._call("MKDIRS", method="put", path=path) | |
297 | ||
298 | def makedirs(self, path, exist_ok=False): | |
299 | if exist_ok is False and self.exists(path): | |
300 | raise FileExistsError(path) | |
301 | self.mkdir(path) | |
302 | ||
303 | def mv(self, path1, path2, **kwargs): | |
304 | self._call("RENAME", method="put", path=path1, destination=path2) | |
305 | ||
306 | def rm(self, path, recursive=False, **kwargs): | |
307 | self._call( | |
308 | "DELETE", | |
309 | method="delete", | |
310 | path=path, | |
311 | recursive="true" if recursive else "false", | |
312 | ) | |
313 | ||
314 | def _apply_proxy(self, location): | |
315 | if self.proxy and callable(self.proxy): | |
316 | location = self.proxy(location) | |
317 | elif self.proxy: | |
318 | # as a dict | |
319 | for k, v in self.proxy.items(): | |
320 | location = location.replace(k, v, 1) | |
321 | return location | |
322 | ||
323 | ||
324 | class WebHDFile(AbstractBufferedFile): | |
325 | """A file living in HDFS over webHDFS""" | |
326 | ||
327 | def __init__(self, fs, path, **kwargs): | |
328 | super().__init__(fs, path, **kwargs) | |
329 | kwargs = kwargs.copy() | |
330 | if kwargs.get("permissions", None) is None: | |
331 | kwargs.pop("permissions", None) | |
332 | if kwargs.get("replication", None) is None: | |
333 | kwargs.pop("replication", None) | |
334 | self.permissions = kwargs.pop("permissions", 511) | |
335 | tempdir = kwargs.pop("tempdir") | |
336 | if kwargs.pop("autocommit", False) is False: | |
337 | self.target = self.path | |
338 | self.path = "/".join([tempdir, str(uuid.uuid4())]) | |
339 | ||
340 | def _upload_chunk(self, final=False): | |
341 | """ Write one part of a multi-block file upload | |
342 | ||
343 | Parameters | |
344 | ---------- | |
345 | final: bool | |
346 | This is the last block, so should complete file, if | |
347 | self.autocommit is True. | |
348 | """ | |
349 | out = self.fs.session.post(self.location, data=self.buffer.getvalue()) | |
350 | out.raise_for_status() | |
351 | return True | |
352 | ||
353 | def _initiate_upload(self): | |
354 | """ Create remote file/upload """ | |
355 | if "a" in self.mode: | |
356 | op, method = "APPEND", "POST" | |
357 | else: | |
358 | op, method = "CREATE", "PUT" | |
359 | if self.fs.exists(self.path): | |
360 | # no "truncate" or "create empty" | |
361 | self.fs.rm(self.path) | |
362 | out = self.fs._call(op, method, self.path, redirect=False, **self.kwargs) | |
363 | location = self.fs._apply_proxy(out.headers["Location"]) | |
364 | if "w" in self.mode: | |
365 | # create empty file to append to | |
366 | out2 = self.fs.session.put(location) | |
367 | out2.raise_for_status() | |
368 | self.location = location.replace("CREATE", "APPEND") | |
369 | ||
370 | def _fetch_range(self, start, end): | |
371 | out = self.fs._call( | |
372 | "OPEN", path=self.path, offset=start, length=end - start, redirect=False | |
373 | ) | |
374 | out.raise_for_status() | |
375 | location = out.headers["Location"] | |
376 | out2 = self.fs.session.get(self.fs._apply_proxy(location)) | |
377 | return out2.content | |
378 | ||
379 | def commit(self): | |
380 | self.fs.mv(self.path, self.target) | |
381 | ||
382 | def discard(self): | |
383 | self.fs.rm(self.path) |
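# A minimal usage sketch (an illustrative addition, not part of the original
# module; host, port and paths below are hypothetical):
#
#     from fsspec.implementations.webhdfs import WebHDFS
#
#     fs = WebHDFS(host="namenode.example.com", port=50070, user="hadoop")
#     with fs.open("/tmp/demo.txt", "wb") as f:
#         f.write(b"hello")          # buffered; uploaded via CREATE/APPEND
#     print(fs.ls("/tmp"))           # LISTSTATUS on the directory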
0 | from __future__ import print_function, division, absolute_import | |
1 | ||
2 | import zipfile | |
3 | from fsspec import AbstractFileSystem, open_files | |
4 | from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE | |
5 | ||
6 | ||
7 | class ZipFileSystem(AbstractFileSystem): | |
8 | """Read contents of ZIP archive as a file-system | |
9 | ||
10 | Keeps file object open while instance lives. | |
11 | ||
12 | This class is pickleable, but not necessarily thread-safe | |
13 | """ | |
14 | ||
15 | root_marker = "" | |
16 | ||
17 | def __init__(self, fo="", mode="r", **storage_options): | |
18 | """ | |
19 | Parameters | |
20 | ---------- | |
21 | fo: str or file-like | |
22 | Contains ZIP, and must exist. If a str, will fetch file using | |
23 | `open_files()`, which must return one file exactly. | |
24 | mode: str | |
25 | Currently, only 'r' accepted | |
26 | storage_options: key-value | |
27 | May be credentials, e.g., `{'auth': ('username', 'pword')}` or any | |
28 | other parameters for requests | |
29 | """ | |
30 | if self._cached: | |
31 | return | |
32 | AbstractFileSystem.__init__(self) | |
33 | if mode != "r": | |
34 | raise ValueError("Only read from zip files accepted") | |
35 | self.in_fo = fo | |
36 | if isinstance(fo, str): | |
37 | files = open_files(fo) | |
38 | if len(files) != 1: | |
39 | raise ValueError( | |
40 | 'Path "{}" did not resolve to exactly' | |
41 | 'one file: "{}"'.format(fo, files) | |
42 | ) | |
43 | fo = files[0] | |
44 | self.fo = fo.__enter__() # the whole instance is a context | |
45 | self.zip = zipfile.ZipFile(self.fo) | |
46 | self.block_size = storage_options.get("block_size", DEFAULT_BLOCK_SIZE) | |
47 | self.dir_cache = None | |
48 | ||
49 | @classmethod | |
50 | def _strip_protocol(cls, path): | |
51 | # zip file paths are always relative to the archive root | |
52 | return super()._strip_protocol(path).lstrip("/") | |
53 | ||
54 | def _get_dirs(self): | |
55 | if self.dir_cache is None: | |
56 | files = self.zip.infolist() | |
57 | self.dir_cache = {} | |
58 | for z in files: | |
59 | f = {s: getattr(z, s) for s in zipfile.ZipInfo.__slots__} | |
60 | f.update( | |
61 | { | |
62 | "name": z.filename, | |
63 | "size": z.file_size, | |
64 | "type": ("directory" if z.is_dir() else "file"), | |
65 | } | |
66 | ) | |
67 | self.dir_cache[f["name"]] = f | |
68 | ||
69 | def ls(self, path, detail=False): | |
70 | self._get_dirs() | |
71 | paths = {} | |
72 | for p, f in self.dir_cache.items(): | |
73 | p = p.rstrip("/") | |
74 | if "/" in p: | |
75 | root = p.rsplit("/", 1)[0] | |
76 | else: | |
77 | root = "" | |
78 | if root == path.rstrip("/"): | |
79 | paths[p] = f | |
80 | elif path and all( | |
81 | (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) | |
82 | ): | |
83 | # implicit directory | |
84 | ppath = "/".join(p.split("/")[: len(path.split("/")) + 1]) | |
85 | if ppath not in paths: | |
86 | out = {"name": ppath + "/", "size": 0, "type": "directory"} | |
87 | paths[ppath] = out | |
88 | ||
89 | elif all( | |
90 | (a == b) | |
91 | for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) | |
92 | ): | |
93 | # root directory entry | |
94 | ppath = p.rstrip("/").split("/", 1)[0] | |
95 | if ppath not in paths: | |
96 | out = {"name": ppath + "/", "size": 0, "type": "directory"} | |
97 | paths[ppath] = out | |
98 | out = list(paths.values()) | |
99 | if detail: | |
100 | return out | |
101 | else: | |
102 | return list(sorted(f["name"] for f in out)) | |
103 | ||
104 | def cat(self, path): | |
105 | return self.zip.read(path) | |
106 | ||
107 | def _open(self, path, mode="rb", **kwargs): | |
108 | path = self._strip_protocol(path) | |
109 | if mode != "rb": | |
110 | raise NotImplementedError | |
111 | info = self.info(path) | |
112 | out = self.zip.open(path, "r") | |
113 | out.size = info["size"] | |
114 | out.name = info["name"] | |
115 | return out | |
116 | ||
117 | def ukey(self, path): | |
118 | return tokenize(path, self.in_fo, self.protocol) |
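# A minimal usage sketch (illustrative; "archive.zip" and the member name
# are hypothetical):
#
#     from fsspec.implementations.zip import ZipFileSystem
#
#     fs = ZipFileSystem("archive.zip")   # str -> resolved via open_files()
#     print(fs.ls(""))                    # entries at the archive root
#     with fs.open("inner/file.txt") as f:
#         data = f.read()                 # read-only; 'rb' is the only mode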
0 | from collections.abc import MutableMapping | |
1 | from .registry import get_filesystem_class | |
2 | from .core import split_protocol | |
3 | ||
4 | ||
5 | class FSMap(MutableMapping): | |
6 | """Wrap a FileSystem instance as a mutable wrapping. | |
7 | ||
8 | The keys of the mapping become files under the given root, and the | |
9 | values (which must be bytes) the contents of those files. | |
10 | ||
11 | Parameters | |
12 | ---------- | |
13 | root: string | |
14 | prefix for all the files | |
15 | fs: FileSystem instance | |
16 | check: bool (=False) | |
17 | if True, performs a touch at the location, to check for write access | |
18 | ||
19 | Examples | |
20 | -------- | |
21 | >>> fs = FileSystem(**parameters) # doctest: +SKIP | |
22 | >>> d = FSMap('my-data/path/', fs) # doctest: +SKIP | |
23 | or, more likely | |
24 | >>> d = fs.get_mapper('my-data/path/') # doctest: +SKIP | |
25 | ||
26 | >>> d['loc1'] = b'Hello World' # doctest: +SKIP | |
27 | >>> list(d.keys()) # doctest: +SKIP | |
28 | ['loc1'] | |
29 | >>> d['loc1'] # doctest: +SKIP | |
30 | b'Hello World' | |
31 | """ | |
32 | ||
33 | def __init__(self, root, fs, check=False, create=False): | |
34 | self.fs = fs | |
35 | self.root = fs._strip_protocol(root).rstrip( | |
36 | "/" | |
37 | ) # we join on '/' in _key_to_str | |
38 | if create: | |
39 | if not self.fs.exists(root): | |
40 | self.fs.mkdir(root) | |
41 | if check: | |
42 | if not self.fs.exists(root): | |
43 | raise ValueError( | |
44 | "Path %s does not exist. Create " | |
45 | " with the ``create=True`` keyword" % root | |
46 | ) | |
47 | self.fs.touch(root + "/a") | |
48 | self.fs.rm(root + "/a") | |
49 | ||
50 | def clear(self): | |
51 | """Remove all keys below root - empties out mapping | |
52 | """ | |
53 | try: | |
54 | self.fs.rm(self.root, True) | |
55 | self.fs.mkdir(self.root) | |
56 | except: # noqa: E722 | |
57 | pass | |
58 | ||
59 | def _key_to_str(self, key): | |
60 | """Generate full path for the key""" | |
61 | if isinstance(key, (tuple, list)): | |
62 | key = str(tuple(key)) | |
63 | else: | |
64 | key = str(key) | |
65 | return "/".join([self.root, key]) if self.root else key | |
66 | ||
67 | def _str_to_key(self, s): | |
68 | """Strip path of to leave key name""" | |
69 | return s[len(self.root) :].lstrip("/") | |
70 | ||
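# Key translation examples derived from the two helpers above (the root
# "bucket/data" is a hypothetical placeholder):
#
#     m._key_to_str("x/y")     -> "bucket/data/x/y"
#     m._key_to_str(("a", 1))  -> "bucket/data/('a', 1)"  # tuples stringified
#     m._str_to_key("bucket/data/x/y") -> "x/y"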
71 | def __getitem__(self, key, default=None): | |
72 | """Retrieve data""" | |
73 | key = self._key_to_str(key) | |
74 | try: | |
75 | result = self.fs.cat(key) | |
76 | except: # noqa: E722 | |
77 | if default is not None: | |
78 | return default | |
79 | raise KeyError(key) | |
80 | return result | |
81 | ||
82 | def pop(self, key, default=None): | |
83 | result = self.__getitem__(key, default) | |
84 | try: | |
85 | del self[key] | |
86 | except KeyError: | |
87 | pass | |
88 | return result | |
89 | ||
90 | def __setitem__(self, key, value): | |
91 | """Store value in key""" | |
92 | key = self._key_to_str(key) | |
93 | self.fs.mkdirs(self.fs._parent(key), exist_ok=True) | |
94 | with self.fs.open(key, "wb") as f: | |
95 | f.write(value) | |
96 | ||
97 | def __iter__(self): | |
98 | return (self._str_to_key(x) for x in self.fs.find(self.root)) | |
99 | ||
100 | def __len__(self): | |
101 | return len(self.fs.find(self.root)) | |
102 | ||
103 | def __delitem__(self, key): | |
104 | """Remove key""" | |
105 | try: | |
106 | self.fs.rm(self._key_to_str(key)) | |
107 | except: # noqa: E722 | |
108 | raise KeyError | |
109 | ||
110 | def __contains__(self, key): | |
111 | """Does key exist in mapping?""" | |
112 | return self.fs.exists(self._key_to_str(key)) | |
113 | ||
114 | def __getstate__(self): | |
115 | """Mapping should be pickleable""" | |
116 | # TODO: replace with reduce to reinstantiate? | |
117 | return self.fs, self.root | |
118 | ||
119 | def __setstate__(self, state): | |
120 | fs, root = state | |
121 | self.fs = fs | |
122 | self.root = root | |
123 | ||
124 | ||
125 | def get_mapper(url, check=False, create=False, **kwargs): | |
126 | """Create key-value interface for given URL and options | |
127 | ||
128 | The URL will be of the form "protocol://location" and point to the root | |
129 | of the mapper required. All keys will be file-names below this location, | |
130 | and their values the contents of each key. | |
131 | ||
132 | Parameters | |
133 | ---------- | |
134 | url: str | |
135 | Root URL of mapping | |
136 | check: bool | |
137 | Whether to attempt to read from the location before instantiation, to | |
138 | check that the mapping does exist | |
139 | create: bool | |
140 | Whether to make the directory corresponding to the root before | |
141 | instantiating | |
142 | ||
143 | Returns | |
144 | ------- | |
145 | ``FSMap`` instance, the dict-like key-value store. | |
146 | """ | |
147 | protocol, path = split_protocol(url) | |
148 | cls = get_filesystem_class(protocol) | |
149 | fs = cls(**kwargs) | |
150 | # Removing protocol here - could defer to each open() on the backend | |
151 | return FSMap(url, fs, check, create) |
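# A minimal usage sketch (illustrative): "memory://mydata" uses the built-in
# in-memory filesystem, so this can run without any external service.
#
#     import fsspec
#
#     m = fsspec.get_mapper("memory://mydata", create=True)
#     m["key"] = b"value"           # written via fs.open(..., "wb")
#     assert m["key"] == b"value"   # read back via fs.cat()
#     del m["key"]                  # removed via fs.rm()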
0 | import importlib | |
1 | from distutils.version import LooseVersion | |
2 | ||
3 | __all__ = ["registry", "get_filesystem_class", "default"] | |
4 | ||
5 | # mapping protocol: implementation class object | |
6 | registry = {} | |
7 | default = "file" | |
8 | ||
9 | # protocols mapped to the class which implements them. This dict can | |
10 | # be dynamically updated. | |
11 | known_implementations = { | |
12 | "file": {"class": "fsspec.implementations.local.LocalFileSystem"}, | |
13 | "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"}, | |
14 | "http": { | |
15 | "class": "fsspec.implementations.http.HTTPFileSystem", | |
16 | "err": 'HTTPFileSystem requires "requests" to be installed', | |
17 | }, | |
18 | "https": { | |
19 | "class": "fsspec.implementations.http.HTTPFileSystem", | |
20 | "err": 'HTTPFileSystem requires "requests" to be installed', | |
21 | }, | |
22 | "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"}, | |
23 | "gcs": { | |
24 | "class": "gcsfs.GCSFileSystem", | |
25 | "err": "Please install gcsfs to access Google Storage", | |
26 | }, | |
27 | "gs": { | |
28 | "class": "gcsfs.GCSFileSystem", | |
29 | "err": "Please install gcsfs to access Google Storage", | |
30 | }, | |
31 | "sftp": { | |
32 | "class": "fsspec.implementations.sftp.SFTPFileSystem", | |
33 | "err": 'SFTPFileSystem requires "paramiko" to be installed', | |
34 | }, | |
35 | "ssh": { | |
36 | "class": "fsspec.implementations.sftp.SFTPFileSystem", | |
37 | "err": 'SFTPFileSystem requires "paramiko" to be installed', | |
38 | }, | |
39 | "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"}, | |
40 | "hdfs": { | |
41 | "class": "fsspec.implementations.hdfs.PyArrowHDFS", | |
42 | "err": "pyarrow and local java libraries required for HDFS", | |
43 | }, | |
44 | "webhdfs": { | |
45 | "class": "fsspec.implementations.webhdfs.WebHDFS", | |
46 | "err": 'webHDFS access requires "requests" to be installed', | |
47 | }, | |
48 | "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"}, | |
49 | "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"}, | |
50 | "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"}, | |
51 | "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"}, | |
52 | "dask": { | |
53 | "class": "fsspec.implementations.dask.DaskWorkerFileSystem", | |
54 | "err": "Install dask distributed to access worker file system", | |
55 | }, | |
56 | } | |
57 | ||
58 | minversions = {"s3fs": LooseVersion("0.3.0"), "gcsfs": LooseVersion("0.3.0")} | |
59 | ||
60 | ||
61 | def get_filesystem_class(protocol): | |
62 | """Fetch named protocol implementation from the registry | |
63 | ||
64 | The dict ``known_implementations`` maps protocol names to the locations | |
65 | of classes implementing the corresponding file-system. When used for the | |
66 | first time, appropriate imports will happen and the class will be placed in | |
67 | the registry. All subsequent calls will fetch directly from the registry. | |
68 | ||
69 | Some protocol implementations require additional dependencies, and so the | |
70 | import may fail. In this case, the string in the "err" field of the | |
71 | ``known_implementations`` will be given as the error message. | |
72 | """ | |
73 | if protocol is None: | |
74 | protocol = default | |
75 | ||
76 | if protocol not in registry: | |
77 | if protocol not in known_implementations: | |
78 | raise ValueError("Protocol not known: %s" % protocol) | |
79 | bit = known_implementations[protocol] | |
80 | mod, name = bit["class"].rsplit(".", 1) | |
81 | minversion = minversions.get(mod, None) | |
82 | err = None | |
83 | try: | |
84 | mod = importlib.import_module(mod) | |
85 | except ImportError: | |
86 | err = ImportError(bit["err"]) | |
87 | ||
88 | except Exception as e: | |
89 | err = e | |
90 | if err is not None: | |
91 | raise RuntimeError(str(err)) | |
92 | ||
93 | if minversion: | |
94 | version = getattr(mod, "__version__", None) | |
95 | if version and LooseVersion(version) < minversion: | |
96 | raise RuntimeError( | |
97 | "'{}={}' is installed, but version '{}' or " | |
98 | "higher is required".format(mod.__name__, version, minversion) | |
99 | ) | |
100 | registry[protocol] = getattr(mod, name) | |
101 | cls = registry[protocol] | |
102 | if getattr(cls, "protocol", None) in ("abstract", None): | |
103 | cls.protocol = protocol | |
104 | ||
105 | return cls | |
106 | ||
107 | ||
108 | def filesystem(protocol, **storage_options): | |
109 | """Instantiate filesystems for given protocol and arguments | |
110 | ||
111 | ``storage_options`` are specific to the protocol being chosen, and are | |
112 | passed directly to the class. | |
113 | """ | |
114 | cls = get_filesystem_class(protocol) | |
115 | return cls(**storage_options) |
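# A sketch of extending the registry with a custom protocol ("myproto" and
# mypackage.myfs.MyFileSystem are hypothetical placeholders):
#
#     from fsspec.registry import known_implementations, filesystem
#
#     known_implementations["myproto"] = {
#         "class": "mypackage.myfs.MyFileSystem",
#         "err": "Install mypackage to use myproto://",
#     }
#     fs = filesystem("myproto")  # imports mypackage.myfs on first use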
0 | import warnings | |
1 | from hashlib import md5 | |
2 | import io | |
3 | import os | |
4 | import logging | |
5 | ||
6 | from .transaction import Transaction | |
7 | from .utils import read_block, tokenize, stringify_path | |
8 | ||
9 | logger = logging.getLogger("fsspec") | |
10 | ||
11 | ||
12 | def make_instance(cls, args, kwargs): | |
13 | return cls(*args, **kwargs) | |
14 | ||
15 | ||
16 | class _Cached(type): | |
17 | """ | |
18 | Metaclass for caching file system instances. | |
19 | ||
20 | Notes | |
21 | ----- | |
22 | Instances are cached according to | |
23 | ||
24 | * The values of the class attributes listed in `_extra_tokenize_attributes` | |
25 | * The arguments passed to ``__init__``. | |
26 | ||
27 | This creates an additional reference to the filesystem, which prevents the | |
28 | filesystem from being garbage collected when all *user* references go away. | |
29 | A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also* | |
30 | be made for a filesystem instance to be garbage collected. | |
31 | """ | |
32 | ||
33 | cachable = True | |
34 | _extra_tokenize_attributes = () | |
35 | ||
36 | def __init__(self, *args, **kwargs): | |
37 | super().__init__(*args, **kwargs) | |
38 | # Note: we intentionally create a reference here, to avoid garbage | |
39 | # collecting instances when all other references are gone. To really | |
40 | # delete a FileSystem, the cache must be cleared. | |
41 | self._cache = {} | |
42 | ||
43 | def __call__(self, *args, **kwargs): | |
44 | cls = type(self) | |
45 | extra_tokens = tuple( | |
46 | getattr(self, attr, None) for attr in self._extra_tokenize_attributes | |
47 | ) | |
48 | token = tokenize(cls, *args, *extra_tokens, **kwargs) | |
49 | if self.cachable and token in self._cache: | |
50 | return self._cache[token] | |
51 | else: | |
52 | obj = super().__call__(*args, **kwargs) | |
53 | # Setting _fs_token here causes some static linters to complain. | |
54 | obj._fs_token_ = token | |
55 | obj.storage_args = args | |
56 | obj.storage_options = kwargs | |
57 | ||
58 | if self.cachable: | |
59 | self._cache[token] = obj | |
60 | return obj | |
61 | ||
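# Illustration of the metaclass cache (a sketch, assuming the bundled local
# filesystem implementation; not part of the original module):
#
#     from fsspec.implementations.local import LocalFileSystem
#
#     fs1 = LocalFileSystem()
#     fs2 = LocalFileSystem()   # same class and args -> same token
#     assert fs1 is fs2         # the cached instance is reused
#     LocalFileSystem.clear_instance_cache()
#     assert LocalFileSystem() is not fs1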
62 | ||
63 | try: # optionally derive from pyarrow's FileSystem, if available | |
64 | import pyarrow as pa | |
65 | ||
66 | up = pa.filesystem.DaskFileSystem | |
67 | except ImportError: | |
68 | up = object | |
69 | ||
70 | ||
71 | class AbstractFileSystem(up, metaclass=_Cached): | |
72 | """ | |
73 | An abstract super-class for pythonic file-systems | |
74 | ||
75 | Implementations are expected to be compatible with or, better, subclass | |
76 | from here. | |
77 | """ | |
78 | ||
79 | cachable = True # this class can be cached, instances reused | |
80 | _cached = False | |
81 | blocksize = 2 ** 22 | |
82 | sep = "/" | |
83 | protocol = "abstract" | |
84 | root_marker = "" # For some FSs, may require leading '/' or other character | |
85 | ||
86 | #: Extra *class attributes* that should be considered when hashing. | |
87 | _extra_tokenize_attributes = () | |
88 | ||
89 | def __init__(self, *args, **storage_options): | |
90 | """Create and configure file-system instance | |
91 | ||
92 | Instances may be cachable, so if similar enough arguments are seen, | |
93 | a new instance is not required. The token attribute exists to allow | |
94 | implementations to cache instances if they wish. | |
95 | ||
96 | A reasonable default should be provided if there are no arguments. | |
97 | ||
98 | Subclasses should call this method. | |
99 | ||
100 | Magic kwargs that affect functionality here: | |
101 | add_docs: if True, will append docstrings from this spec to the | |
102 | specific implementation (deprecated; now only emits a warning) | |
103 | """ | |
104 | if self._cached: | |
105 | # reusing instance, don't change | |
106 | return | |
107 | self._cached = True | |
108 | self._intrans = False | |
109 | self._transaction = None | |
110 | self.dircache = {} | |
111 | ||
112 | if storage_options.pop("add_docs", None): | |
113 | warnings.warn("add_docs is no longer supported.", FutureWarning) | |
114 | ||
115 | if storage_options.pop("add_aliases", None): | |
116 | warnings.warn("add_aliases has been removed.", FutureWarning) | |
117 | # This is set in _Cached | |
118 | self._fs_token_ = None | |
119 | ||
120 | @property | |
121 | def _fs_token(self): | |
122 | return self._fs_token_ | |
123 | ||
124 | def __dask_tokenize__(self): | |
125 | return self._fs_token | |
126 | ||
127 | def __hash__(self): | |
128 | return int(self._fs_token, 16) | |
129 | ||
130 | def __eq__(self, other): | |
131 | return isinstance(other, type(self)) and self._fs_token == other._fs_token | |
132 | ||
133 | @classmethod | |
134 | def _strip_protocol(cls, path): | |
135 | """ Turn path from fully-qualified to file-system-specific | |
136 | ||
137 | May require FS-specific handling, e.g., for relative paths or links. | |
138 | """ | |
139 | path = stringify_path(path) | |
140 | protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol | |
141 | for protocol in protos: | |
142 | path = path.rstrip("/") | |
143 | if path.startswith(protocol + "://"): | |
144 | path = path[len(protocol) + 3 :] | |
145 | elif path.startswith(protocol + ":"): | |
146 | path = path[len(protocol) + 1 :] | |
147 | # use of root_marker to make minimum required path, e.g., "/" | |
148 | return path or cls.root_marker | |
149 | ||
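# Concrete traces of the default implementation above (here ``protocol`` is
# "abstract" and ``root_marker`` is ""):
#
#     AbstractFileSystem._strip_protocol("abstract://a/b/")  # -> "a/b"
#     AbstractFileSystem._strip_protocol("abstract://")      # -> "" (root_marker)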
150 | @staticmethod | |
151 | def _get_kwargs_from_urls(paths): | |
152 | """If kwargs can be encoded in the paths, extract them here | |
153 | ||
154 | This should happen before instantiation of the class; incoming paths | |
155 | then should be amended to strip the options in methods. | |
156 | ||
157 | Examples may look like an sftp path "sftp://user@host:/my/path", where | |
158 | the user and host should become kwargs and later get stripped. | |
159 | """ | |
160 | # by default, nothing happens | |
161 | return {} | |
162 | ||
163 | @classmethod | |
164 | def current(cls): | |
165 | """ Return the most recently created FileSystem | |
166 | ||
167 | If no instance has been created, then create one with defaults | |
168 | """ | |
169 | if not len(cls._cache): | |
170 | return cls() | |
171 | else: | |
172 | return list(cls._cache.values())[-1] | |
173 | ||
174 | @property | |
175 | def transaction(self): | |
176 | """A context within which files are committed together upon exit | |
177 | ||
178 | Requires the file class to implement `.commit()` and `.discard()` | |
179 | for the normal and exception cases. | |
180 | """ | |
181 | if self._transaction is None: | |
182 | self._transaction = Transaction(self) | |
183 | return self._transaction | |
184 | ||
185 | def start_transaction(self): | |
186 | """Begin write transaction for deferring files, non-context version""" | |
187 | self._intrans = True | |
188 | self._transaction = Transaction(self) | |
189 | return self.transaction | |
190 | ||
191 | def end_transaction(self): | |
192 | """Finish write transaction, non-context version""" | |
193 | self.transaction.complete() | |
194 | self._transaction = None | |
195 | ||
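# Usage sketch: Transaction also acts as a context manager, so the deferred
# commit is typically spelled as below ("data/part-0" is hypothetical):
#
#     with fs.transaction:
#         with fs.open("data/part-0", "wb") as f:
#             f.write(b"...")
#     # files are committed together when the block exits without error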
196 | def invalidate_cache(self, path=None): | |
197 | """ | |
198 | Discard any cached directory information | |
199 | ||
200 | Parameters | |
201 | ---------- | |
202 | path: string or None | |
203 | If None, clear all cached listings; otherwise, clear listings at or | |
204 | under the given path. | |
205 | """ | |
206 | pass # not necessary to implement, may have no cache | |
207 | ||
208 | def mkdir(self, path, create_parents=True, **kwargs): | |
209 | """ | |
210 | Create directory entry at path | |
211 | ||
212 | For systems that don't have true directories, may create an entry for | |
213 | this instance only and not touch the real filesystem | |
214 | ||
215 | Parameters | |
216 | ---------- | |
217 | path: str | |
218 | location | |
219 | create_parents: bool | |
220 | if True, this is equivalent to ``makedirs`` | |
221 | kwargs: | |
222 | may be permissions, etc. | |
223 | """ | |
224 | pass # not necessary to implement, may not have directories | |
225 | ||
226 | def makedirs(self, path, exist_ok=False): | |
227 | """Recursively make directories | |
228 | ||
229 | Creates directory at path and any intervening required directories. | |
230 | Raises exception if, for instance, the path already exists but is a | |
231 | file. | |
232 | ||
233 | Parameters | |
234 | ---------- | |
235 | path: str | |
236 | leaf directory name | |
237 | exist_ok: bool (False) | |
238 | If False, will error if the target already exists | |
239 | """ | |
240 | pass # not necessary to implement, may not have directories | |
241 | ||
242 | def rmdir(self, path): | |
243 | """Remove a directory, if empty""" | |
244 | pass # not necessary to implement, may not have directories | |
245 | ||
246 | def ls(self, path, detail=True, **kwargs): | |
247 | """List objects at path. | |
248 | ||
249 | This should include subdirectories and files at that location. The | |
250 | difference between a file and a directory must be clear when details | |
251 | are requested. | |
252 | ||
253 | The specific keys, or perhaps a FileInfo class, or similar, is TBD, | |
254 | but must be consistent across implementations. | |
255 | Must include: | |
256 | - full path to the entry (without protocol) | |
257 | - size of the entry, in bytes. If the value cannot be determined, will | |
258 | be ``None``. | |
259 | - type of entry, "file", "directory" or other | |
260 | ||
261 | Additional information | |
262 | may be present, appropriate to the file-system, e.g., generation, | |
263 | checksum, etc. | |
264 | ||
265 | May use refresh=True|False to allow use of self._ls_from_cache to | |
266 | check for a saved listing and avoid calling the backend. This would be | |
267 | common where listing may be expensive. | |
268 | ||
269 | Parameters | |
270 | ---------- | |
271 | path: str | |
272 | detail: bool | |
273 | if True, gives a list of dictionaries, where each is the same as | |
274 | the result of ``info(path)``. If False, gives a list of paths | |
275 | (str). | |
276 | kwargs: may have additional backend-specific options, such as version | |
277 | information | |
278 | ||
279 | Returns | |
280 | ------- | |
281 | List of strings if detail is False, or list of directory information | |
282 | dicts if detail is True. | |
283 | """ | |
284 | raise NotImplementedError | |
285 | ||
286 | def _ls_from_cache(self, path): | |
287 | """Check cache for listing | |
288 | ||
289 | Returns listing, if found (may be an empty list for a directory that exists | |
290 | but contains nothing), None if not in cache. | |
291 | """ | |
292 | parent = self._parent(path) | |
293 | if path in self.dircache: | |
294 | return self.dircache[path] | |
295 | elif parent in self.dircache: | |
296 | files = [f for f in self.dircache[parent] if f["name"] == path] | |
297 | if len(files) == 0: | |
298 | # parent dir was listed but did not contain this file | |
299 | raise FileNotFoundError(path) | |
300 | return files | |
301 | ||
302 | def walk(self, path, maxdepth=None, **kwargs): | |
303 | """ Return all files belows path | |
304 | ||
305 | List all files, recursing into subdirectories; output is iterator-style, | |
306 | like ``os.walk()``. For a simple list of files, ``find()`` is available. | |
307 | ||
308 | Note that the "files" outputted will include anything that is not | |
309 | a directory, such as links. | |
310 | ||
311 | Parameters | |
312 | ---------- | |
313 | path: str | |
314 | Root to recurse into | |
315 | maxdepth: int | |
316 | Maximum recursion depth. None means limitless, but not recommended | |
317 | on link-based file-systems. | |
318 | kwargs: passed to ``ls`` | |
319 | """ | |
320 | path = self._strip_protocol(path) | |
321 | full_dirs = [] | |
322 | dirs = [] | |
323 | files = [] | |
324 | ||
325 | try: | |
326 | listing = self.ls(path, detail=True, **kwargs) | |
327 | except (FileNotFoundError, IOError): | |
328 | return [], [], [] | |
329 | ||
330 | for info in listing: | |
331 | # each info name must be at least [path]/part , but here | |
332 | # we check also for names like [path]/part/ | |
333 | name = info["name"].rstrip("/") | |
334 | if info["type"] == "directory" and name != path: | |
335 | # do not include "self" path | |
336 | full_dirs.append(name) | |
337 | dirs.append(name.rsplit("/", 1)[-1]) | |
338 | elif name == path: | |
339 | # file-like with same name as given path | |
340 | files.append("") | |
341 | else: | |
342 | files.append(name.rsplit("/", 1)[-1]) | |
343 | yield path, dirs, files | |
344 | ||
345 | for d in full_dirs: | |
346 | if maxdepth is None or maxdepth > 1: | |
347 | for res in self.walk( | |
348 | d, | |
349 | maxdepth=(maxdepth - 1) if maxdepth is not None else None, | |
350 | **kwargs | |
351 | ): | |
352 | yield res | |
353 | ||
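# Consuming the iterator (a sketch with hypothetical directory names):
#
#     for root, dirs, files in fs.walk("/data", maxdepth=2):
#         print(root, dirs, files)
#     # e.g. "/data" ["sub"] ["a.csv"], then "/data/sub" [] ["b.csv"]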
354 | def find(self, path, maxdepth=None, withdirs=False, **kwargs): | |
355 | """List all files below path. | |
356 | ||
357 | Like posix ``find`` command without conditions | |
358 | ||
359 | Parameters | |
360 | ---------- | |
361 | path : str | |
362 | maxdepth: int or None | |
363 | If not None, the maximum number of levels to descend | |
364 | withdirs: bool | |
365 | Whether to include directory paths in the output. This is True | |
366 | when used by glob, but users usually only want files. | |
367 | kwargs are passed to ``ls``. | |
368 | """ | |
369 | # TODO: allow equivalent of -name parameter | |
370 | out = set() | |
371 | for path, dirs, files in self.walk(path, maxdepth, **kwargs): | |
372 | if withdirs: | |
373 | files += dirs | |
374 | for name in files: | |
375 | if name and name not in out: | |
376 | out.add("/".join([path.rstrip("/"), name]) if path else name) | |
377 | if self.isfile(path) and path not in out: | |
378 | # walk works on directories, but find should also return [path] | |
379 | # when path happens to be a file | |
380 | out.add(path) | |
381 | return sorted(out) | |
382 | ||
383 | def du(self, path, total=True, maxdepth=None, **kwargs): | |
384 | """Space used by files within a path | |
385 | ||
386 | Parameters | |
387 | ---------- | |
388 | path: str | |
389 | total: bool | |
390 | whether to sum all the file sizes | |
391 | maxdepth: int or None | |
392 | maximum number of directory levels to descend, None for unlimited. | |
393 | kwargs: passed to ``ls`` | |
394 | ||
395 | Returns | |
396 | ------- | |
397 | Dict of {fn: size} if total=False, or int otherwise, where numbers | |
398 | refer to bytes used. | |
399 | """ | |
400 | sizes = {} | |
401 | for f in self.find(path, maxdepth=maxdepth, **kwargs): | |
402 | info = self.info(f) | |
403 | sizes[info["name"]] = info["size"] | |
404 | if total: | |
405 | return sum(sizes.values()) | |
406 | else: | |
407 | return sizes | |
408 | ||
409 | def glob(self, path, **kwargs): | |
410 | """ | |
411 | Find files by glob-matching. | |
412 | ||
413 | If the path ends with '/' and does not contain "*", it is essentially | |
414 | the same as ``ls(path)``, returning only files. | |
415 | ||
416 | We support ``"**"``, | |
417 | ``"?"`` and ``"[..]"``. | |
418 | ||
419 | kwargs are passed to ``ls``. | |
420 | """ | |
421 | import re | |
422 | from glob import has_magic | |
423 | ||
424 | ends = path.endswith("/") | |
425 | path = self._strip_protocol(path) | |
426 | indstar = path.find("*") if path.find("*") >= 0 else len(path) | |
427 | indques = path.find("?") if path.find("?") >= 0 else len(path) | |
428 | indbrace = path.find("[") if path.find("[") >= 0 else len(path) | |
429 | ||
430 | ind = min(indstar, indques, indbrace) | |
431 | ||
432 | if not has_magic(path): | |
433 | root = path | |
434 | depth = 1 | |
435 | if ends: | |
436 | path += "/*" | |
437 | elif self.exists(path): | |
438 | return [path] | |
439 | else: | |
440 | return [] # glob of non-existent returns empty | |
441 | elif "/" in path[:ind]: | |
442 | ind2 = path[:ind].rindex("/") | |
443 | root = path[: ind2 + 1] | |
444 | depth = 20 if "**" in path else path[ind2 + 1 :].count("/") + 1 | |
445 | else: | |
446 | root = "" | |
447 | depth = 20 if "**" in path else 1 | |
448 | allpaths = self.find(root, maxdepth=depth, withdirs=True, **kwargs) | |
449 | pattern = ( | |
450 | "^" | |
451 | + ( | |
452 | path.replace("\\", r"\\") | |
453 | .replace(".", r"\.") | |
454 | .replace("+", r"\+") | |
455 | .replace("//", "/") | |
456 | .replace("(", r"\(") | |
457 | .replace(")", r"\)") | |
458 | .replace("|", r"\|") | |
459 | .rstrip("/") | |
460 | .replace("?", ".") | |
461 | ) | |
462 | + "$" | |
463 | ) | |
464 | pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern) | |
465 | pattern = re.sub("[*]", "[^/]*", pattern) | |
466 | pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*")) | |
467 | out = {p for p in allpaths if pattern.match(p.replace("//", "/").rstrip("/"))} | |
468 | return list(sorted(out)) | |
469 | ||
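# Trace of the translation above (a sketch): "data/*.csv" yields root
# "data/", depth 1, and the regex "^data/[^/]*\.csv$"; a "**" pattern uses
# depth 20 and the ".*" placeholder:
#
#     fs.glob("data/*.csv")     # files directly under data/
#     fs.glob("data/**/*.csv")  # also descends into subdirectories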
470 | def exists(self, path): | |
471 | """Is there a file at the given path""" | |
472 | try: | |
473 | self.info(path) | |
474 | return True | |
475 | except: # noqa: E722 | |
476 | # any exception allowed bar FileNotFoundError? | |
477 | return False | |
478 | ||
479 | def info(self, path, **kwargs): | |
480 | """Give details of entry at path | |
481 | ||
482 | Returns a single dictionary, with exactly the same information as ``ls`` | |
483 | would with ``detail=True``. | |
484 | ||
485 | The default implementation calls ls and could be overridden by a | |
486 | shortcut. kwargs are passed on to ``ls()``. | |
487 | ||
488 | Some file systems might not be able to measure the file's size, in | |
489 | which case, the returned dict will include ``'size': None``. | |
490 | ||
491 | Returns | |
492 | ------- | |
493 | dict with keys: name (full path in the FS), size (in bytes), type (file, | |
494 | directory, or something else) and other FS-specific keys. | |
495 | """ | |
496 | path = self._strip_protocol(path) | |
497 | out = self.ls(self._parent(path), detail=True, **kwargs) | |
498 | out = [o for o in out if o["name"].rstrip("/") == path] | |
499 | if out: | |
500 | return out[0] | |
501 | out = self.ls(path, detail=True, **kwargs) | |
502 | path = path.rstrip("/") | |
503 | out1 = [o for o in out if o["name"].rstrip("/") == path] | |
504 | if len(out1) == 1: | |
505 | if "size" not in out1[0]: | |
506 | out1[0]["size"] = None | |
507 | return out1[0] | |
508 | elif len(out1) > 1 or out: | |
509 | return {"name": path, "size": 0, "type": "directory"} | |
510 | else: | |
511 | raise FileNotFoundError(path) | |
512 | ||
513 | def checksum(self, path): | |
514 | """Unique value for current version of file | |
515 | ||
516 | If the checksum is the same from one moment to another, the contents | |
517 | are guaranteed to be the same. If the checksum changes, the contents | |
518 | *might* have changed. | |
519 | ||
520 | This should normally be overridden; default will probably capture | |
521 | creation/modification timestamp (which would be good) or maybe | |
522 | access timestamp (which would be bad) | |
523 | """ | |
524 | return int(tokenize(self.info(path)), 16) | |
525 | ||
526 | def size(self, path): | |
527 | """Size in bytes of file""" | |
528 | return self.info(path).get("size", None) | |
529 | ||
530 | def isdir(self, path): | |
531 | """Is this entry directory-like?""" | |
532 | try: | |
533 | return self.info(path)["type"] == "directory" | |
534 | except FileNotFoundError: | |
535 | return False | |
536 | ||
537 | def isfile(self, path): | |
538 | """Is this entry file-like?""" | |
539 | try: | |
540 | return self.info(path)["type"] == "file" | |
541 | except: # noqa: E722 | |
542 | return False | |
543 | ||
544 | def cat(self, path): | |
545 | """ Get the content of a file """ | |
546 | return self.open(path, "rb").read() | |
547 | ||
548 | def get(self, rpath, lpath, recursive=False, **kwargs): | |
549 | """Copy file to local. | |
550 | ||
551 | Possible extension: maybe should be able to copy to any file-system | |
552 | (streaming through local). | |
553 | """ | |
554 | rpath = self._strip_protocol(rpath) | |
555 | if recursive: | |
556 | rpaths = self.find(rpath) | |
557 | lpaths = [ | |
558 | os.path.join(lpath, path[len(rpath) :].lstrip("/")) for path in rpaths | |
559 | ] | |
560 | for lpath in lpaths: | |
561 | dirname = os.path.dirname(lpath) | |
562 | if not os.path.isdir(dirname): | |
563 | os.makedirs(dirname) | |
564 | else: | |
565 | rpaths = [rpath] | |
566 | lpaths = [lpath] | |
567 | for lpath, rpath in zip(lpaths, rpaths): | |
568 | with self.open(rpath, "rb", **kwargs) as f1: | |
569 | with open(lpath, "wb") as f2: | |
570 | data = True | |
571 | while data: | |
572 | data = f1.read(self.blocksize) | |
573 | f2.write(data) | |
574 | ||
575 | def put(self, lpath, rpath, recursive=False, **kwargs): | |
576 | """ Upload file from local """ | |
577 | if recursive: | |
578 | lpaths = [] | |
579 | for dirname, subdirlist, filelist in os.walk(lpath): | |
580 | lpaths += [os.path.join(dirname, filename) for filename in filelist] | |
581 | rootdir = os.path.basename(lpath.rstrip("/")) | |
582 | if self.exists(rpath): | |
583 | # copy lpath inside rpath directory | |
584 | rpath2 = os.path.join(rpath, rootdir) | |
585 | else: | |
586 | # copy lpath as rpath directory | |
587 | rpath2 = rpath | |
588 | rpaths = [ | |
589 | os.path.join(rpath2, path[len(lpath) :].lstrip("/")) for path in lpaths | |
590 | ] | |
591 | else: | |
592 | lpaths = [lpath] | |
593 | rpaths = [rpath] | |
594 | for lpath, rpath in zip(lpaths, rpaths): | |
595 | with open(lpath, "rb") as f1: | |
596 | with self.open(rpath, "wb", **kwargs) as f2: | |
597 | data = True | |
598 | while data: | |
599 | data = f1.read(self.blocksize) | |
600 | f2.write(data) | |
601 | ||
602 | def head(self, path, size=1024): | |
603 | """ Get the first ``size`` bytes from file """ | |
604 | with self.open(path, "rb") as f: | |
605 | return f.read(size) | |
606 | ||
607 | def tail(self, path, size=1024): | |
608 | """ Get the last ``size`` bytes from file """ | |
609 | with self.open(path, "rb") as f: | |
610 | f.seek(max(-size, -f.size), 2) | |
611 | return f.read() | |
612 | ||
613 | def copy(self, path1, path2, **kwargs): | |
614 | """ Copy within two locations in the filesystem""" | |
615 | raise NotImplementedError | |
616 | ||
617 | def mv(self, path1, path2, **kwargs): | |
618 | """ Move file from one location to another """ | |
619 | self.copy(path1, path2, **kwargs) | |
620 | self.rm(path1, recursive=False) | |
621 | ||
622 | def _rm(self, path): | |
623 | """Delete a file""" | |
624 | raise NotImplementedError | |
625 | ||
626 | def rm(self, path, recursive=False, maxdepth=None): | |
627 | """Delete files. | |
628 | ||
629 | Parameters | |
630 | ---------- | |
631 | path: str or list of str | |
632 | File(s) to delete. | |
633 | recursive: bool | |
634 | If file(s) are directories, recursively delete contents and then | |
635 | also remove the directory | |
636 | maxdepth: int or None | |
637 | Depth to pass to walk for finding files to delete, if recursive. | |
638 | If None, there will be no limit and infinite recursion may be | |
639 | possible. | |
640 | """ | |
641 | # prefer some bulk method, if possible | |
642 | if not isinstance(path, list): | |
643 | path = [path] | |
644 | for p in path: | |
645 | if recursive: | |
646 | out = self.walk(p, maxdepth=maxdepth) | |
647 | for pa_, _, files in reversed(list(out)): | |
648 | for name in files: | |
649 | fn = "/".join([pa_, name]) if pa_ else name | |
650 | self.rm(fn) | |
651 | self.rmdir(pa_) | |
652 | else: | |
653 | self._rm(p) | |
654 | ||
655 | @classmethod | |
656 | def _parent(cls, path): | |
657 | path = cls._strip_protocol(path.rstrip("/")) | |
658 | if "/" in path: | |
659 | return cls.root_marker + path.rsplit("/", 1)[0] | |
660 | else: | |
661 | return cls.root_marker | |
662 | ||
663 | def _open( | |
664 | self, | |
665 | path, | |
666 | mode="rb", | |
667 | block_size=None, | |
668 | autocommit=True, | |
669 | cache_options=None, | |
670 | **kwargs | |
671 | ): | |
672 | """Return raw bytes-mode file-like from the file-system""" | |
673 | return AbstractBufferedFile( | |
674 | self, | |
675 | path, | |
676 | mode, | |
677 | block_size, | |
678 | autocommit, | |
679 | cache_options=cache_options, | |
680 | **kwargs | |
681 | ) | |
682 | ||
683 | def open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs): | |
684 | """ | |
685 | Return a file-like object from the filesystem | |
686 | ||
687 | The resultant instance must function correctly in a context ``with`` | |
688 | block. | |
689 | ||
690 | Parameters | |
691 | ---------- | |
692 | path: str | |
693 | Target file | |
694 | mode: str like 'rb', 'w' | |
695 | See builtin ``open()`` | |
696 | block_size: int | |
697 | Some indication of buffering - this is a value in bytes | |
698 | cache_options : dict, optional | |
699 | Extra arguments to pass through to the cache. | |
700 | encoding, errors, newline: passed on to TextIOWrapper for text mode | |
701 | """ | |
702 | import io | |
703 | ||
704 | path = self._strip_protocol(path) | |
705 | if "b" not in mode: | |
706 | mode = mode.replace("t", "") + "b" | |
707 | ||
708 | text_kwargs = { | |
709 | k: kwargs.pop(k) | |
710 | for k in ["encoding", "errors", "newline"] | |
711 | if k in kwargs | |
712 | } | |
713 | return io.TextIOWrapper( | |
714 | self.open(path, mode, block_size, **kwargs), **text_kwargs | |
715 | ) | |
716 | else: | |
717 | ac = kwargs.pop("autocommit", not self._intrans) | |
718 | f = self._open( | |
719 | path, | |
720 | mode=mode, | |
721 | block_size=block_size, | |
722 | autocommit=ac, | |
723 | cache_options=cache_options, | |
724 | **kwargs | |
725 | ) | |
726 | if not ac: | |
727 | self.transaction.files.append(f) | |
728 | return f | |
729 | ||
730 | def touch(self, path, truncate=True, **kwargs): | |
731 | """ Create empty file, or update timestamp | |
732 | ||
733 | Parameters | |
734 | ---------- | |
735 | path: str | |
736 | file location | |
737 | truncate: bool | |
738 | If True, always set file size to 0; if False, update timestamp and | |
739 | leave file unchanged, if backend allows this | |
740 | """ | |
741 | if truncate or not self.exists(path): | |
742 | with self.open(path, "wb", **kwargs): | |
743 | pass | |
744 | else: | |
745 | raise NotImplementedError # update timestamp, if possible | |
746 | ||
747 | def ukey(self, path): | |
748 | """Hash of file properties, to tell if it has changed""" | |
749 | return md5(str(self.info(path)).encode()).hexdigest() | |
750 | ||
751 | def read_block(self, fn, offset, length, delimiter=None): | |
752 | """ Read a block of bytes from | |
753 | ||
754 | Starting at ``offset`` of the file, read ``length`` bytes. If | |
755 | ``delimiter`` is set then we ensure that the read starts and stops at | |
756 | delimiter boundaries that follow the locations ``offset`` and ``offset | |
757 | + length``. If ``offset`` is zero then we start at zero. The | |
758 | bytestring returned WILL include the end delimiter string. | |
759 | ||
760 | If offset+length is beyond the eof, reads to eof. | |
761 | ||
762 | Parameters | |
763 | ---------- | |
764 | fn: string | |
765 | Path to filename | |
766 | offset: int | |
767 | Byte offset to start read | |
768 | length: int | |
769 | Number of bytes to read | |
770 | delimiter: bytes (optional) | |
771 | Ensure reading starts and stops at delimiter bytestring | |
772 | ||
773 | Examples | |
774 | -------- | |
775 | >>> fs.read_block('data/file.csv', 0, 13) # doctest: +SKIP | |
776 | b'Alice, 100\\nBo' | |
777 | >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n') # doctest: +SKIP | |
778 | b'Alice, 100\\nBob, 200\\n' | |
779 | ||
780 | Use ``length=None`` to read to the end of the file. | |
781 | >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n') # doctest: +SKIP | |
782 | b'Alice, 100\\nBob, 200\\nCharlie, 300' | |
783 | ||
784 | See Also | |
785 | -------- | |
786 | utils.read_block | |
787 | """ | |
788 | with self.open(fn, "rb") as f: | |
789 | size = f.size | |
790 | if length is None: | |
791 | length = size | |
792 | if size is not None and offset + length > size: | |
793 | length = size - offset | |
794 | return read_block(f, offset, length, delimiter) | |
795 | ||
796 | def __reduce__(self): | |
797 | return make_instance, (type(self), self.storage_args, self.storage_options) | |
798 | ||
799 | def _get_pyarrow_filesystem(self): | |
800 | """ | |
801 | Make a version of the FS instance which will be acceptable to pyarrow | |
802 | """ | |
803 | # all instances already also derive from pyarrow | |
804 | return self | |
805 | ||
806 | def get_mapper(self, root, check=False, create=False): | |
807 | """Create key/value store based on this file-system | |
808 | ||
809 | Makes a MutableMapping interface to the FS at the given root path. | |
810 | See ``fsspec.mapping.FSMap`` for further details. | |
811 | """ | |
812 | from .mapping import FSMap | |
813 | ||
814 | return FSMap(root, self, check, create) | |
815 | ||
816 | @classmethod | |
817 | def clear_instance_cache(cls): | |
818 | """ | |
819 | Clear the cache of filesystem instances. | |
820 | ||
821 | Notes | |
822 | ----- | |
823 | Unless overridden by setting the ``cachable`` class attribute to False, | |
824 | the filesystem class stores a reference to newly created instances. This | |
825 | prevents Python's normal rules around garbage collection from working, | |
826 | since the instance's refcount will not drop to zero until | |
827 | ``clear_instance_cache`` is called. | |
828 | """ | |
829 | cls._cache.clear() | |
830 | ||
831 | # ------------------------------------------------------------------------ | |
832 | # Aliases | |
833 | ||
834 | def makedir(self, path, create_parents=True, **kwargs): | |
835 | """Alias of :ref:`FilesystemSpec.mkdir`.""" | |
836 | return self.mkdir(path, create_parents=create_parents, **kwargs) | |
837 | ||
838 | def mkdirs(self, path, exist_ok=False): | |
839 | """Alias of :ref:`FilesystemSpec.makedirs`.""" | |
840 | return self.makedirs(path, exist_ok=exist_ok) | |
841 | ||
842 | def listdir(self, path, detail=True, **kwargs): | |
843 | """Alias of :ref:`FilesystemSpec.ls`.""" | |
844 | return self.ls(path, detail=detail, **kwargs) | |
845 | ||
846 | def cp(self, path1, path2, **kwargs): | |
847 | """Alias of :ref:`FilesystemSpec.copy`.""" | |
848 | return self.copy(path1, path2, **kwargs) | |
849 | ||
850 | def move(self, path1, path2, **kwargs): | |
851 | """Alias of :ref:`FilesystemSpec.mv`.""" | |
852 | return self.mv(path1, path2, **kwargs) | |
853 | ||
854 | def stat(self, path, **kwargs): | |
855 | """Alias of :ref:`FilesystemSpec.info`.""" | |
856 | return self.info(path, **kwargs) | |
857 | ||
858 | def disk_usage(self, path, total=True, maxdepth=None, **kwargs): | |
859 | """Alias of :ref:`FilesystemSpec.du`.""" | |
860 | return self.du(path, total=total, maxdepth=maxdepth, **kwargs) | |
861 | ||
862 | def rename(self, path1, path2, **kwargs): | |
863 | """Alias of :ref:`FilesystemSpec.mv`.""" | |
864 | return self.mv(path1, path2, **kwargs) | |
865 | ||
866 | def delete(self, path, recursive=False, maxdepth=None): | |
867 | """Alias of :ref:`FilesystemSpec.rm`.""" | |
868 | return self.rm(path, recursive=recursive, maxdepth=maxdepth) | |
869 | ||
870 | def upload(self, lpath, rpath, recursive=False, **kwargs): | |
871 | """Alias of :ref:`FilesystemSpec.put`.""" | |
872 | return self.put(lpath, rpath, recursive=recursive, **kwargs) | |
873 | ||
874 | def download(self, rpath, lpath, recursive=False, **kwargs): | |
875 | """Alias of :ref:`FilesystemSpec.get`.""" | |
876 | return self.get(rpath, lpath, recursive=recursive, **kwargs) | |
877 | ||
878 | ||
879 | class AbstractBufferedFile(io.IOBase): | |
880 | """Convenient class to derive from to provide buffering | |
881 | ||
882 | In the case that the backend does not provide a pythonic file-like object | |
883 | already, this class contains much of the logic to build one. The only | |
884 | methods that need to be overridden are ``_upload_chunk``, | |
885 | ``_initiate_upload`` and ``_fetch_range``. | |
886 | """ | |
887 | ||
888 | DEFAULT_BLOCK_SIZE = 5 * 2 ** 20 | |
889 | ||
890 | def __init__( | |
891 | self, | |
892 | fs, | |
893 | path, | |
894 | mode="rb", | |
895 | block_size="default", | |
896 | autocommit=True, | |
897 | cache_type="readahead", | |
898 | cache_options=None, | |
899 | **kwargs | |
900 | ): | |
901 | """ | |
902 | Template for files with buffered reading and writing | |
903 | ||
904 | Parameters | |
905 | ---------- | |
906 | fs: instance of FileSystem | |
907 | path: str | |
908 | location in file-system | |
909 | mode: str | |
910 | Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file | |
911 | systems may be read-only, and some may not support append. | |
912 | block_size: int | |
913 | Buffer size for reading or writing, 'default' for class default | |
914 | autocommit: bool | |
915 | Whether to write to final destination; may only impact what | |
916 | happens when file is being closed. | |
917 | cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead" | |
918 | Caching policy in read mode. See the definitions in ``core``. | |
919 | cache_options : dict | |
920 | Additional options passed to the constructor for the cache specified | |
921 | by `cache_type`. | |
922 | kwargs: | |
923 | Gets stored as self.kwargs | |
924 | """ | |
925 | from .core import caches | |
926 | ||
927 | self.path = path | |
928 | self.fs = fs | |
929 | self.mode = mode | |
930 | self.blocksize = ( | |
931 | self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size | |
932 | ) | |
933 | self.loc = 0 | |
934 | self.autocommit = autocommit | |
935 | self.end = None | |
936 | self.start = None | |
937 | self.closed = False | |
938 | ||
939 | if cache_options is None: | |
940 | cache_options = {} | |
941 | ||
942 | if "trim" in kwargs: | |
943 | warnings.warn( | |
944 | "Passing 'trim' to control the cache behavior has been deprecated. " | |
945 | "Specify it within the 'cache_options' argument instead.", | |
946 | FutureWarning, | |
947 | ) | |
948 | cache_options["trim"] = kwargs.pop("trim") | |
949 | ||
950 | self.kwargs = kwargs | |
951 | ||
952 | if mode not in {"ab", "rb", "wb"}: | |
953 | raise NotImplementedError("File mode not supported") | |
954 | if mode == "rb": | |
955 | if not hasattr(self, "details"): | |
956 | self.details = fs.info(path) | |
957 | self.size = self.details["size"] | |
958 | self.cache = caches[cache_type]( | |
959 | self.blocksize, self._fetch_range, self.size, **cache_options | |
960 | ) | |
961 | else: | |
962 | self.buffer = io.BytesIO() | |
963 | self.offset = None | |
964 | self.forced = False | |
965 | self.location = None | |
966 | ||
967 | @property | |
968 | def closed(self): | |
969 | # get around this attr being read-only in IOBase | |
970 | return self._closed | |
971 | ||
972 | @closed.setter | |
973 | def closed(self, c): | |
974 | self._closed = c | |
975 | ||
976 | def __hash__(self): | |
977 | if "w" in self.mode: | |
978 | return id(self) | |
979 | else: | |
980 | return int(tokenize(self.details), 16) | |
981 | ||
982 | def __eq__(self, other): | |
983 | """Files are equal if they have the same checksum, only in read mode""" | |
984 | return self.mode == "rb" and other.mode == "rb" and hash(self) == hash(other) | |
985 | ||
986 | def commit(self): | |
987 | """Move from temp to final destination""" | |
988 | ||
989 | def discard(self): | |
990 | """Throw away temporary file""" | |
991 | ||
992 | def info(self): | |
993 | """ File information about this path """ | |
994 | if "r" in self.mode: | |
995 | return self.details | |
996 | else: | |
997 | raise ValueError("Info not available while writing") | |
998 | ||
999 | def tell(self): | |
1000 | """ Current file location """ | |
1001 | return self.loc | |
1002 | ||
1003 | def seek(self, loc, whence=0): | |
1004 | """ Set current file location | |
1005 | ||
1006 | Parameters | |
1007 | ---------- | |
1008 | loc: int | |
1009 | byte location | |
1010 | whence: {0, 1, 2} | |
1011 | from start of file, current location or end of file, resp. | |
1012 | """ | |
1013 | loc = int(loc) | |
1014 | if not self.mode == "rb": | |
1015 | raise ValueError("Seek only available in read mode") | |
1016 | if whence == 0: | |
1017 | nloc = loc | |
1018 | elif whence == 1: | |
1019 | nloc = self.loc + loc | |
1020 | elif whence == 2: | |
1021 | nloc = self.size + loc | |
1022 | else: | |
1023 | raise ValueError("invalid whence (%s, should be 0, 1 or 2)" % whence) | |
1024 | if nloc < 0: | |
1025 | raise ValueError("Seek before start of file") | |
1026 | self.loc = nloc | |
1027 | return self.loc | |
1028 | ||
1029 | def write(self, data): | |
1030 | """ | |
1031 | Write data to buffer. | |
1032 | ||
1033 | Buffer only sent on flush() or if buffer is greater than | |
1034 | or equal to blocksize. | |
1035 | ||
1036 | Parameters | |
1037 | ---------- | |
1038 | data: bytes | |
1039 | Set of bytes to be written. | |
1040 | """ | |
1041 | if self.mode not in {"wb", "ab"}: | |
1042 | raise ValueError("File not in write mode") | |
1043 | if self.closed: | |
1044 | raise ValueError("I/O operation on closed file.") | |
1045 | if self.forced: | |
1046 | raise ValueError("This file has been force-flushed, can only close") | |
1047 | out = self.buffer.write(data) | |
1048 | self.loc += out | |
1049 | if self.buffer.tell() >= self.blocksize: | |
1050 | self.flush() | |
1051 | return out | |
1052 | ||
1053 | def flush(self, force=False): | |
1054 | """ | |
1055 | Write buffered data to backend store. | |
1056 | ||
1057 | Writes the current buffer, if it is larger than the block-size, or if | |
1058 | the file is being closed. | |
1059 | ||
1060 | Parameters | |
1061 | ---------- | |
1062 | force: bool | |
1063 | When closing, write the last block even if it is smaller than | |
1064 | blocks are allowed to be. Disallows further writing to this file. | |
1065 | """ | |
1066 | ||
1067 | if self.closed: | |
1068 | raise ValueError("Flush on closed file") | |
1069 | if force and self.forced: | |
1070 | raise ValueError("Force flush cannot be called more than once") | |
1071 | if force: | |
1072 | self.forced = True | |
1073 | ||
1074 | if self.mode not in {"wb", "ab"}: | |
1075 | # no-op to flush on read-mode | |
1076 | return | |
1077 | ||
1078 | if not force and self.buffer.tell() < self.blocksize: | |
1079 | # Defer write on small block | |
1080 | return | |
1081 | ||
1082 | if self.offset is None: | |
1083 | # Initialize a multipart upload | |
1084 | self.offset = 0 | |
1085 | self._initiate_upload() | |
1086 | ||
1087 | if self._upload_chunk(final=force) is not False: | |
1088 | self.offset += self.buffer.seek(0, 2) | |
1089 | self.buffer = io.BytesIO() | |
1090 | ||
1091 | def _upload_chunk(self, final=False): | |
1092 | """ Write one part of a multi-block file upload | |
1093 | ||
1094 | Parameters | |
1095 | ---------- | |
1096 | final: bool | |
1097 | Whether this is the last block; if so, the file upload should be | |
1098 | completed when self.autocommit is True. | |
1099 | """ | |
1100 | # may not yet have been initialized; may need to call _initiate_upload first | |
1101 | ||
1102 | def _initiate_upload(self): | |
1103 | """ Create remote file/upload """ | |
1104 | pass | |
1105 | ||
1106 | def _fetch_range(self, start, end): | |
1107 | """Get the specified set of bytes from remote""" | |
1108 | raise NotImplementedError | |
1109 | ||
1110 | def read(self, length=-1): | |
1111 | """ | |
1112 | Return data from cache, or fetch pieces as necessary | |
1113 | ||
1114 | Parameters | |
1115 | ---------- | |
1116 | length: int (-1) | |
1117 | Number of bytes to read; if <0, all remaining bytes. | |
1118 | """ | |
1119 | length = -1 if length is None else int(length) | |
1120 | if self.mode != "rb": | |
1121 | raise ValueError("File not in read mode") | |
1122 | if length < 0: | |
1123 | length = self.size - self.loc | |
1124 | if self.closed: | |
1125 | raise ValueError("I/O operation on closed file.") | |
1126 | logger.debug("%s read: %i - %i" % (self, self.loc, self.loc + length)) | |
1127 | if length == 0: | |
1128 | # don't even bother calling fetch | |
1129 | return b"" | |
1130 | out = self.cache._fetch(self.loc, self.loc + length) | |
1131 | self.loc += len(out) | |
1132 | return out | |
1133 | ||
1134 | def readinto(self, b): | |
1135 | """mirrors builtin file's readinto method | |
1136 | ||
1137 | https://docs.python.org/3/library/io.html#io.RawIOBase.readinto | |
1138 | """ | |
1139 | data = self.read(len(b)) | |
1140 | b[: len(data)] = data | |
1141 | return len(data) | |
1142 | ||
1143 | def readuntil(self, char=b"\n", blocks=None): | |
1144 | """Return data between current position and first occurrence of char | |
1145 | ||
1146 | char is included in the output, except if the end of the file is | |
1147 | encountered first. | |
1148 | ||
1149 | Parameters | |
1150 | ---------- | |
1151 | char: bytes | |
1152 | Byte sequence to find | |
1153 | blocks: None or int | |
1154 | How much to read in each go. Defaults to file blocksize - which may | |
1155 | mean a new read on every call. | |
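 | ||
 | Examples | |
 | -------- | |
 | For a file whose contents begin ``b"hello"``, the delimiter is | |
 | included in the output and the position is left just past it: | |
 | ||
 | >>> f.readuntil(b"l")  # doctest: +SKIP | |
 | b'hel' | |
 | >>> f.tell()  # doctest: +SKIP | |
 | 3 | |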
1156 | """ | |
1157 | out = [] | |
1158 | while True: | |
1159 | start = self.tell() | |
1160 | part = self.read(blocks or self.blocksize) | |
1161 | if len(part) == 0: | |
1162 | break | |
1163 | found = part.find(char) | |
1164 | if found > -1: | |
1165 | out.append(part[: found + len(char)]) | |
1166 | self.seek(start + found + len(char)) | |
1167 | break | |
1168 | out.append(part) | |
1169 | return b"".join(out) | |
1170 | ||
1171 | def readline(self): | |
1172 | """Read until first occurrence of newline character | |
1173 | ||
1174 | Note that, because of character encoding, this is not necessarily a | |
1175 | true line ending. | |
1176 | """ | |
1177 | return self.readuntil(b"\n") | |
1178 | ||
1179 | def __next__(self): | |
1180 | out = self.readline() | |
1181 | if out: | |
1182 | return out | |
1183 | raise StopIteration | |
1184 | ||
1185 | def __iter__(self): | |
1186 | return self | |
1187 | ||
1188 | def readlines(self): | |
1189 | """Return all data, split by the newline character""" | |
1190 | data = self.read() | |
1191 | lines = data.split(b"\n") | |
1192 | out = [l + b"\n" for l in lines[:-1]] | |
1193 | if data.endswith(b"\n"): | |
1194 | return out | |
1195 | else: | |
1196 | return out + [lines[-1]] | |
1197 | # note: this could alternatively return list(self) | |
1198 | ||
1199 | def readinto1(self, b): | |
1200 | return self.readinto(b) | |
1201 | ||
1202 | def close(self): | |
1203 | """ Close file | |
1204 | ||
1205 | Finalizes writes, discards cache | |
1206 | """ | |
1207 | if self.closed: | |
1208 | return | |
1209 | if self.mode == "rb": | |
1210 | self.cache = None | |
1211 | else: | |
1212 | if not self.forced: | |
1213 | self.flush(force=True) | |
1214 | ||
1215 | if self.fs is not None: | |
1216 | self.fs.invalidate_cache(self.path) | |
1217 | self.fs.invalidate_cache(self.fs._parent(self.path)) | |
1218 | ||
1219 | self.closed = True | |
1220 | ||
1221 | def readable(self): | |
1222 | """Whether opened for reading""" | |
1223 | return self.mode == "rb" and not self.closed | |
1224 | ||
1225 | def seekable(self): | |
1226 | """Whether is seekable (only in read mode)""" | |
1227 | return self.readable() | |
1228 | ||
1229 | def writable(self): | |
1230 | """Whether opened for writing""" | |
1231 | return self.mode in {"wb", "ab"} and not self.closed | |
1232 | ||
1233 | def __del__(self): | |
1234 | self.close() | |
1235 | ||
1236 | def __str__(self): | |
1237 | return "<File-like object %s, %s>" % (type(self.fs).__name__, self.path) | |
1238 | ||
1239 | __repr__ = __str__ | |
1240 | ||
1241 | def __enter__(self): | |
1242 | return self | |
1243 | ||
1244 | def __exit__(self, *args): | |
1245 | self.close() |
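 | ||
 | ||
 | # A minimal sketch (editorial illustration, not part of the library) of how | |
 | # a backend specializes AbstractBufferedFile: read support only requires | |
 | # ``_fetch_range``; write support additionally needs ``_initiate_upload`` | |
 | # and ``_upload_chunk``. The class name and in-memory "backend" below are | |
 | # hypothetical. | |
 | # | |
 | #     from fsspec.spec import AbstractBufferedFile | |
 | # | |
 | #     class BytesBackedFile(AbstractBufferedFile): | |
 | #         def __init__(self, fs, path, data=b"", **kwargs): | |
 | #             self._data = data | |
 | #             # setting details up front skips the fs.info() call above | |
 | #             self.details = {"name": path, "size": len(data), "type": "file"} | |
 | #             super().__init__(fs, path, mode="rb", **kwargs) | |
 | # | |
 | #         def _fetch_range(self, start, end): | |
 | #             # serve the requested byte window from the in-memory payload | |
 | #             return self._data[start:end] | |
 | # | |
 | #     f = BytesBackedFile(None, "/mem/example", b"hello world") | |
 | #     f.read(5)  # -> b"hello", served via the default "readahead" cache | |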
0 | """Tests the spec, using memoryfs""" | |
1 | ||
2 | import os | |
3 | import pickle | |
4 | from fsspec.implementations.memory import MemoryFileSystem, MemoryFile | |
5 | ||
6 | ||
7 | def test_idempotent(): | |
8 | MemoryFileSystem.clear_instance_cache() | |
9 | fs = MemoryFileSystem() | |
10 | fs2 = MemoryFileSystem() | |
11 | assert fs is fs2 | |
12 | assert MemoryFileSystem.current() is fs2 | |
13 | ||
14 | MemoryFileSystem.clear_instance_cache() | |
15 | assert not MemoryFileSystem._cache | |
16 | ||
17 | fs2 = MemoryFileSystem().current() | |
18 | assert fs == fs2 | |
19 | ||
20 | ||
21 | def test_pickle(): | |
22 | fs = MemoryFileSystem() | |
23 | fs2 = pickle.loads(pickle.dumps(fs)) | |
24 | assert fs == fs2 | |
25 | ||
26 | ||
27 | def test_class_methods(): | |
28 | assert MemoryFileSystem._strip_protocol("memory:stuff") == "stuff" | |
29 | assert MemoryFileSystem._strip_protocol("memory://stuff") == "stuff" | |
30 | assert MemoryFileSystem._strip_protocol("stuff") == "stuff" | |
31 | assert MemoryFileSystem._strip_protocol("other://stuff") == "other://stuff" | |
32 | ||
33 | assert MemoryFileSystem._get_kwargs_from_urls("memory://user@thing") == {} | |
34 | ||
35 | ||
36 | def test_get_put(tmpdir): | |
37 | tmpdir = str(tmpdir) | |
38 | fn = os.path.join(tmpdir, "one") | |
39 | open(fn, "wb").write(b"one") | |
40 | os.mkdir(os.path.join(tmpdir, "dir")) | |
41 | fn2 = os.path.join(tmpdir, "dir", "two") | |
42 | open(fn2, "wb").write(b"two") | |
43 | ||
44 | fs = MemoryFileSystem() | |
45 | fs.put(fn, "/afile") | |
46 | assert fs.cat("/afile") == b"one" | |
47 | ||
48 | fs.store["/bfile"] = MemoryFile(fs, "/bfile", b"data") | |
49 | fn3 = os.path.join(tmpdir, "three") | |
50 | fs.get("/bfile", fn3) | |
51 | assert open(fn3, "rb").read() == b"data" | |
52 | ||
53 | fs.put(tmpdir, "/more", recursive=True) | |
54 | assert fs.find("/more") == ["/more/dir/two", "/more/one", "/more/three"] | |
55 | ||
56 | for f in [fn, fn2, fn3]: | |
57 | os.remove(f) | |
58 | os.rmdir(os.path.join(tmpdir, "dir")) | |
59 | ||
60 | fs.get("/more/", tmpdir + "/", recursive=True) | |
61 | assert open(fn3, "rb").read() == b"data" | |
62 | assert open(fn, "rb").read() == b"one" | |
63 | ||
64 | ||
65 | def test_du(): | |
66 | fs = MemoryFileSystem() | |
67 | fs.store = { | |
68 | "/dir/afile": MemoryFile(fs, "/afile", b"a"), | |
69 | "/dir/dirb/afile": MemoryFile(fs, "/afile", b"bb"), | |
70 | "/dir/dirb/bfile": MemoryFile(fs, "/afile", b"ccc"), | |
71 | } | |
72 | assert fs.du("/dir") == 6 | |
73 | assert fs.du("/dir", total=False)["/dir/dirb/afile"] == 2 | |
74 | assert fs.du("/dir", maxdepth=0) == 1 | |
75 | ||
76 | ||
77 | def test_head_tail(): | |
78 | fs = MemoryFileSystem() | |
79 | with fs.open("/myfile", "wb") as f: | |
80 | f.write(b"I had a nice big cabbage") | |
81 | assert fs.head("/myfile", 5) == b"I had" | |
82 | assert fs.tail("/myfile", 7) == b"cabbage" | |
83 | ||
84 | ||
85 | def test_move(): | |
86 | fs = MemoryFileSystem() | |
87 | with fs.open("/myfile", "wb") as f: | |
88 | f.write(b"I had a nice big cabbage") | |
89 | fs.move("/myfile", "/otherfile") | |
90 | assert not fs.exists("/myfile") | |
91 | assert fs.info("/otherfile") | |
92 | assert isinstance(fs.ukey("/otherfile"), str) | |
93 | ||
94 | ||
95 | def test_read_block_delimiter(): | |
96 | fs = MemoryFileSystem() | |
97 | with fs.open("/myfile", "wb") as f: | |
98 | f.write(b"some\n" b"lines\n" b"of\n" b"text") | |
99 | assert fs.read_block("/myfile", 0, 2, b"\n") == b"some\n" | |
100 | assert fs.read_block("/myfile", 2, 6, b"\n") == b"lines\n" | |
101 | assert fs.read_block("/myfile", 6, 2, b"\n") == b"" | |
102 | assert fs.read_block("/myfile", 2, 9, b"\n") == b"lines\nof\n" | |
103 | assert fs.read_block("/myfile", 12, 6, b"\n") == b"text" | |
104 | assert fs.read_block("/myfile", 0, None) == fs.cat("/myfile") | |
105 | ||
106 | ||
107 | def test_open_text(): | |
108 | fs = MemoryFileSystem() | |
109 | with fs.open("/myfile", "wb") as f: | |
110 | f.write(b"some\n" b"lines\n" b"of\n" b"text") | |
111 | f = fs.open("/myfile", "r", encoding="latin1") | |
112 | assert f.encoding == "latin1" |
0 | import pathlib | |
1 | ||
2 | import pytest | |
3 | ||
4 | import fsspec.core | |
5 | from fsspec.compression import compr, register_compression | |
6 | from fsspec.utils import compressions, infer_compression | |
7 | ||
8 | ||
9 | def test_infer_custom_compression(): | |
10 | """Inferred compression gets values from fsspec.compression.compr.""" | |
11 | assert infer_compression("fn.zip") == "zip" | |
12 | assert infer_compression("fn.gz") == "gzip" | |
13 | assert infer_compression("fn.unknown") is None | |
14 | assert infer_compression("fn.test_custom") is None | |
15 | assert infer_compression("fn.tst") is None | |
16 | ||
17 | register_compression("test_custom", lambda f, **kwargs: f, "tst") | |
18 | ||
19 | try: | |
20 | assert infer_compression("fn.zip") == "zip" | |
21 | assert infer_compression("fn.gz") == "gzip" | |
22 | assert infer_compression("fn.unknown") is None | |
23 | assert infer_compression("fn.test_custom") is None | |
24 | assert infer_compression("fn.tst") == "test_custom" | |
25 | ||
26 | # Duplicate registration in name or extension raises a value error. | |
27 | with pytest.raises(ValueError): | |
28 | register_compression("test_custom", lambda f, **kwargs: f, "tst") | |
29 | ||
30 | with pytest.raises(ValueError): | |
31 | register_compression("test_conflicting", lambda f, **kwargs: f, "tst") | |
32 | assert "test_conflicting" not in compr | |
33 | ||
34 | # ...but can be forced. | |
35 | register_compression( | |
36 | "test_conflicting", lambda f, **kwargs: f, "tst", force=True | |
37 | ) | |
38 | assert infer_compression("fn.zip") == "zip" | |
39 | assert infer_compression("fn.gz") == "gzip" | |
40 | assert infer_compression("fn.unknown") is None | |
41 | assert infer_compression("fn.test_custom") is None | |
42 | assert infer_compression("fn.tst") == "test_conflicting" | |
43 | ||
44 | finally: | |
45 | del compr["test_custom"] | |
46 | del compr["test_conflicting"] | |
47 | del compressions["tst"] | |
48 | ||
49 | ||
50 | def test_lzma_compression_name(): | |
51 | pytest.importorskip("lzma") | |
52 | assert infer_compression("fn.xz") == "xz" | |
53 | ||
54 | ||
55 | def test_lz4_compression(tmpdir): | |
56 | """Infer lz4 compression for .lz4 files if lz4 is available.""" | |
57 | tmp_path = pathlib.Path(str(tmpdir)) | |
58 | ||
59 | lz4 = pytest.importorskip("lz4") | |
60 | ||
61 | tmp_path.mkdir(exist_ok=True) | |
62 | ||
63 | tdat = "foobar" * 100 | |
64 | ||
65 | with fsspec.core.open( | |
66 | str(tmp_path / "out.lz4"), mode="wt", compression="infer" | |
67 | ) as outfile: | |
68 | outfile.write(tdat) | |
69 | ||
70 | compressed = (tmp_path / "out.lz4").open("rb").read() | |
71 | assert lz4.frame.decompress(compressed).decode() == tdat | |
72 | ||
73 | with fsspec.core.open( | |
74 | str(tmp_path / "out.lz4"), mode="rt", compression="infer" | |
75 | ) as infile: | |
76 | assert infile.read() == tdat | |
77 | ||
78 | with fsspec.core.open( | |
79 | str(tmp_path / "out.lz4"), mode="rt", compression="lz4" | |
80 | ) as infile: | |
81 | assert infile.read() == tdat | |
82 | ||
83 | ||
84 | def test_zstd_compression(tmpdir): | |
85 | """Infer zstd compression for .zst files if zstandard is available.""" | |
86 | tmp_path = pathlib.Path(str(tmpdir)) | |
87 | ||
88 | zstd = pytest.importorskip("zstandard") | |
89 | ||
90 | tmp_path.mkdir(exist_ok=True) | |
91 | ||
92 | tdat = "foobar" * 100 | |
93 | ||
94 | with fsspec.core.open( | |
95 | str(tmp_path / "out.zst"), mode="wt", compression="infer" | |
96 | ) as outfile: | |
97 | outfile.write(tdat) | |
98 | ||
99 | compressed = (tmp_path / "out.zst").open("rb").read() | |
100 | assert zstd.ZstdDecompressor().decompress(compressed, len(tdat)).decode() == tdat | |
101 | ||
102 | with fsspec.core.open( | |
103 | str(tmp_path / "out.zst"), mode="rt", compression="infer" | |
104 | ) as infile: | |
105 | assert infile.read() == tdat | |
106 | ||
107 | with fsspec.core.open( | |
108 | str(tmp_path / "out.zst"), mode="rt", compression="zstd" | |
109 | ) as infile: | |
110 | assert infile.read() == tdat | |
111 | ||
112 | ||
113 | def test_snappy_compression(tmpdir): | |
114 | """No registered compression for snappy, but can be specified.""" | |
115 | tmp_path = pathlib.Path(str(tmpdir)) | |
116 | ||
117 | snappy = pytest.importorskip("snappy") | |
118 | ||
119 | tmp_path.mkdir(exist_ok=True) | |
120 | ||
121 | tdat = "foobar" * 100 | |
122 | ||
123 | # Snappy isn't inferred. | |
124 | with fsspec.core.open( | |
125 | str(tmp_path / "out.snappy"), mode="wt", compression="infer" | |
126 | ) as outfile: | |
127 | outfile.write(tdat) | |
128 | assert (tmp_path / "out.snappy").open("rb").read().decode() == tdat | |
129 | ||
130 | # but can be specified. | |
131 | with fsspec.core.open( | |
132 | str(tmp_path / "out.snappy"), mode="wt", compression="snappy" | |
133 | ) as outfile: | |
134 | outfile.write(tdat) | |
135 | ||
136 | compressed = (tmp_path / "out.snappy").open("rb").read() | |
137 | assert snappy.StreamDecompressor().decompress(compressed).decode() == tdat | |
138 | ||
139 | with fsspec.core.open( | |
140 | str(tmp_path / "out.snappy"), mode="rb", compression="infer" | |
141 | ) as infile: | |
142 | assert infile.read() == compressed | |
143 | ||
144 | with fsspec.core.open( | |
145 | str(tmp_path / "out.snappy"), mode="rt", compression="snappy" | |
146 | ) as infile: | |
147 | assert infile.read() == tdat |
0 | import pytest | |
1 | import pickle | |
2 | import string | |
3 | ||
4 | from fsspec.core import ( | |
5 | _expand_paths, | |
6 | OpenFile, | |
7 | caches, | |
8 | get_compression, | |
9 | BaseCache, | |
10 | BlockCache, | |
11 | ) | |
12 | ||
13 | ||
14 | @pytest.mark.parametrize( | |
15 | "path, name_function, num, out", | |
16 | [ | |
17 | [["apath"], None, 1, ["apath"]], | |
18 | ["apath.*.csv", None, 1, ["apath.0.csv"]], | |
19 | ["apath.*.csv", None, 2, ["apath.0.csv", "apath.1.csv"]], | |
20 | ["a*", lambda x: "abc"[x], 2, ["aa", "ab"]], | |
21 | ], | |
22 | ) | |
23 | def test_expand_paths(path, name_function, num, out): | |
24 | assert _expand_paths(path, name_function, num) == out | |
25 | ||
26 | ||
27 | def test_expand_error(): | |
28 | with pytest.raises(ValueError): | |
29 | _expand_paths("*.*", None, 1) | |
30 | ||
31 | ||
32 | def test_openfile_api(m): | |
33 | m.open("somepath", "wb").write(b"data") | |
34 | of = OpenFile(m, "somepath") | |
35 | assert str(of) == "<OpenFile 'somepath'>" | |
36 | f = of.open() | |
37 | assert f.read() == b"data" | |
38 | f.close() | |
39 | with OpenFile(m, "somepath", mode="rt") as f: | |
40 | f.read() == "data" | |
41 | ||
42 | ||
43 | # For test_cache_pickleable(). Functions are only picklable if they are defined | |
44 | # at the top-level of a module | |
45 | def _fetcher(start, end): | |
46 | return b"0" * (end - start) | |
47 | ||
48 | ||
49 | def letters_fetcher(start, end): | |
50 | return string.ascii_letters[start:end].encode() | |
51 | ||
52 | ||
53 | @pytest.fixture(params=caches.values(), ids=list(caches.keys())) | |
54 | def Cache_imp(request): | |
55 | return request.param | |
56 | ||
57 | ||
58 | def test_cache_empty_file(Cache_imp): | |
59 | blocksize = 5 | |
60 | size = 0 | |
61 | cache = Cache_imp(blocksize, _fetcher, size) | |
62 | assert cache._fetch(0, 0) == b"" | |
63 | ||
64 | ||
65 | def test_cache_pickleable(Cache_imp): | |
66 | blocksize = 5 | |
67 | size = 100 | |
68 | cache = Cache_imp(blocksize, _fetcher, size) | |
69 | cache._fetch(0, 5) # fill in cache | |
70 | unpickled = pickle.loads(pickle.dumps(cache)) | |
71 | assert isinstance(unpickled, Cache_imp) | |
72 | assert unpickled.blocksize == blocksize | |
73 | assert unpickled.size == size | |
74 | assert unpickled._fetch(0, 10) == b"0" * 10 | |
75 | ||
76 | ||
77 | @pytest.mark.parametrize( | |
78 | "size_requests", | |
79 | [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]], | |
80 | ) | |
81 | @pytest.mark.parametrize("blocksize", [1, 10, 52, 100]) | |
82 | def test_cache_basic(Cache_imp, blocksize, size_requests): | |
83 | cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters)) | |
84 | ||
85 | for start, end in size_requests: | |
86 | result = cache[start:end] | |
87 | expected = string.ascii_letters[start:end].encode() | |
88 | assert result == expected | |
89 | ||
90 | ||
91 | def test_xz_lzma_compressions(): | |
92 | pytest.importorskip("lzma") | |
93 | # Ensure that both 'xz' and 'lzma' compression names can be parsed | |
94 | assert get_compression("some_file.xz", "infer") == "xz" | |
95 | assert get_compression("some_file.xz", "xz") == "xz" | |
96 | assert get_compression("some_file.xz", "lzma") == "lzma" | |
97 | ||
98 | ||
99 | def test_cache_getitem(Cache_imp): | |
100 | cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters)) | |
101 | assert cacher[0:4] == b"abcd" | |
102 | assert cacher[:4] == b"abcd" | |
103 | assert cacher[-3:] == b"XYZ" | |
104 | assert cacher[-3:-1] == b"XY" | |
105 | assert cacher[2:4] == b"cd" | |
106 | ||
107 | ||
108 | def test_cache_getitem_raises(): | |
109 | cacher = BaseCache(4, letters_fetcher, len(string.ascii_letters)) | |
110 | with pytest.raises(TypeError, match="int"): | |
111 | cacher[5] | |
112 | ||
113 | with pytest.raises(ValueError, match="contiguous"): | |
114 | cacher[::4] | |
115 | ||
116 | ||
117 | def test_block_cache_lru(): | |
118 | cache = BlockCache(4, letters_fetcher, len(string.ascii_letters), maxblocks=2) | |
119 | # miss | |
120 | cache[0:2] | |
121 | assert cache.cache_info().hits == 0 | |
122 | assert cache.cache_info().misses == 1 | |
123 | assert cache.cache_info().currsize == 1 | |
124 | ||
125 | # hit | |
126 | cache[0:2] | |
127 | assert cache.cache_info().hits == 1 | |
128 | assert cache.cache_info().misses == 1 | |
129 | assert cache.cache_info().currsize == 1 | |
130 | ||
131 | # miss | |
132 | cache[4:6] | |
133 | assert cache.cache_info().hits == 1 | |
134 | assert cache.cache_info().misses == 2 | |
135 | assert cache.cache_info().currsize == 2 | |
136 | ||
137 | # miss & evict | |
138 | cache[12:13] | |
139 | assert cache.cache_info().hits == 1 | |
140 | assert cache.cache_info().misses == 3 | |
141 | assert cache.cache_info().currsize == 2 |
0 | """Tests abstract buffered file API, using FTP implementation""" | |
1 | import pickle | |
2 | import sys | |
3 | import pytest | |
4 | from fsspec.implementations.tests.test_ftp import FTPFileSystem | |
5 | ||
6 | data = b"hello" * 10000 | |
7 | ||
8 | ||
9 | @pytest.mark.xfail( | |
10 | sys.version_info < (3, 6), | |
11 | reason="py35 error, see https://github.com/intake/filesystem_spec/issues/147", | |
12 | ) | |
13 | def test_pickle(ftp_writable): | |
14 | host, port, user, pw = ftp_writable | |
15 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) | |
16 | ||
17 | f = ftp.open("/out", "rb") | |
18 | ||
19 | f2 = pickle.loads(pickle.dumps(f)) | |
20 | assert f == f2 | |
21 | ||
22 | ||
23 | def test_file_read_attributes(ftp_writable): | |
24 | host, port, user, pw = ftp_writable | |
25 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) | |
26 | ||
27 | f = ftp.open("/out", "rb") | |
28 | assert f.info()["size"] == len(data) | |
29 | assert f.tell() == 0 | |
30 | assert f.seekable() | |
31 | assert f.readable() | |
32 | assert not f.writable() | |
33 | out = bytearray(len(data)) | |
34 | ||
35 | assert f.read() == data | |
36 | assert f.read() == b"" | |
37 | f.seek(0) | |
38 | assert f.readuntil(b"l") == b"hel" | |
39 | assert f.tell() == 3 | |
40 | ||
41 | f.readinto1(out) | |
42 | assert out[:-3] == data[3:] | |
43 | with pytest.raises(ValueError): | |
44 | f.write(b"") | |
45 | f.close() | |
46 | with pytest.raises(ValueError): | |
47 | f.read()(b"") | |
48 | ||
49 | ||
50 | def test_seek(ftp_writable): | |
51 | host, port, user, pw = ftp_writable | |
52 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) | |
53 | ||
54 | f = ftp.open("/out", "rb") | |
55 | ||
56 | assert f.seek(-10, 2) == len(data) - 10 | |
57 | assert f.tell() == len(data) - 10 | |
58 | assert f.seek(-1, 1) == len(data) - 11 | |
59 | with pytest.raises(ValueError): | |
60 | f.seek(-1) | |
61 | with pytest.raises(ValueError): | |
62 | f.seek(0, 7) | |
63 | ||
64 | ||
65 | def test_file_idempotent(ftp_writable): | |
66 | host, port, user, pw = ftp_writable | |
67 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) | |
68 | ||
69 | f = ftp.open("/out", "rb") | |
70 | f2 = ftp.open("/out", "rb") | |
71 | assert hash(f) == hash(f2) | |
72 | assert f == f2 | |
73 | ftp.touch("/out2") | |
74 | f2 = ftp.open("/out2", "rb") | |
75 | assert hash(f2) != hash(f) | |
76 | assert f != f2 | |
77 | f2 = ftp.open("/out", "wb") | |
78 | assert hash(f2) != hash(f) | |
79 | ||
80 | ||
81 | def test_file_text_attributes(ftp_writable): | |
82 | host, port, user, pw = ftp_writable | |
83 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) | |
84 | ||
85 | data = b"hello\n" * 1000 | |
86 | with ftp.open("/out2", "wb") as f: | |
87 | f.write(data) | |
88 | ||
89 | f = ftp.open("/out2", "rb") | |
90 | assert f.readline() == b"hello\n" | |
91 | f.seek(0) | |
92 | assert list(f) == [d + b"\n" for d in data.split()] | |
93 | f.seek(0) | |
94 | assert f.readlines() == [d + b"\n" for d in data.split()] | |
95 | ||
96 | f = ftp.open("/out2", "rt") | |
97 | assert f.readline() == "hello\n" | |
98 | assert f.encoding | |
99 | ||
100 | ||
101 | def test_file_write_attributes(ftp_writable): | |
102 | host, port, user, pw = ftp_writable | |
103 | ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) | |
104 | f = ftp.open("/out2", "wb") | |
105 | with pytest.raises(ValueError): | |
106 | f.info() | |
107 | with pytest.raises(ValueError): | |
108 | f.seek(0) | |
109 | with pytest.raises(ValueError): | |
110 | f.read(0) | |
111 | assert not f.readable() | |
112 | assert f.writable() | |
113 | ||
114 | f.flush() # no-op | |
115 | ||
116 | assert f.write(b"hello") == 5 | |
117 | assert f.write(b"hello") == 5 | |
118 | assert not f.closed | |
119 | f.close() | |
120 | assert f.closed | |
121 | with pytest.raises(ValueError): | |
122 | f.write(b"") | |
123 | with pytest.raises(ValueError): | |
124 | f.flush() | |
125 | ||
126 | ||
127 | def test_midread_cache(ftp_writable): | |
128 | host, port, user, pw = ftp_writable | |
129 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) | |
130 | fn = "/myfile" | |
131 | with fs.open(fn, "wb") as f: | |
132 | f.write(b"a" * 175627146) | |
133 | with fs.open(fn, "rb") as f: | |
134 | f.seek(175561610) | |
135 | d1 = f.read(65536) | |
136 | assert len(d1) == 65536 | |
137 | ||
138 | f.seek(4) | |
139 | size = 17562198 | |
140 | d2 = f.read(size) | |
141 | assert len(d2) == size | |
142 | ||
143 | f.seek(17562288) | |
144 | size = 17562187 | |
145 | d3 = f.read(size) | |
146 | assert len(d3) == size | |
147 | ||
148 | ||
149 | def test_read_block(ftp_writable): | |
150 | # not the same as test_read_block in test_utils; this depends on the | |
151 | # behaviour of the bytes caching | |
152 | from fsspec.utils import read_block | |
153 | ||
154 | host, port, user, pw = ftp_writable | |
155 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) | |
156 | fn = "/myfile" | |
157 | with fs.open(fn, "wb") as f: | |
158 | f.write(b"a,b\n1,2") | |
159 | f = fs.open(fn, "rb", cache_type="bytes") | |
160 | assert read_block(f, 0, 6400, b"\n") == b"a,b\n1,2" | |
161 | ||
162 | ||
163 | def test_with_gzip(ftp_writable): | |
164 | import gzip | |
165 | ||
166 | data = b"some compressable stuff" | |
167 | host, port, user, pw = ftp_writable | |
168 | fs = FTPFileSystem(host=host, port=port, username=user, password=pw) | |
169 | fn = "/myfile" | |
170 | with fs.open(fn, "wb") as f: | |
171 | gf = gzip.GzipFile(fileobj=f, mode="w") | |
172 | gf.write(data) | |
173 | gf.close() | |
174 | with fs.open(fn, "rb") as f: | |
175 | gf = gzip.GzipFile(fileobj=f, mode="r") | |
176 | assert gf.read() == data |
0 | import os | |
1 | import signal | |
2 | import time | |
3 | from multiprocessing import Process | |
4 | ||
5 | import pytest | |
6 | ||
7 | pytest.importorskip("fuse") # noqa: E402 | |
8 | ||
9 | from fsspec.fuse import run | |
10 | from fsspec.implementations.memory import MemoryFileSystem | |
11 | ||
12 | ||
13 | def host_fuse(mountdir): | |
14 | fs = MemoryFileSystem() | |
15 | fs.touch("/mounted/testfile") | |
16 | run(fs, "/mounted/", mountdir) | |
17 | ||
18 | ||
19 | def test_basic(tmpdir): | |
20 | mountdir = str(tmpdir.mkdir("mount")) | |
21 | ||
22 | fuse_process = Process(target=host_fuse, args=(str(mountdir),)) | |
23 | fuse_process.start() | |
24 | ||
25 | try: | |
26 | timeout = 10 | |
27 | while True: | |
28 | try: | |
29 | # can fail with device not ready while waiting for fuse | |
30 | if "testfile" in os.listdir(mountdir): | |
31 | break | |
32 | except Exception: | |
33 | pass | |
34 | timeout -= 1 | |
35 | time.sleep(1) | |
36 | assert timeout > 0, "Timeout" | |
37 | ||
38 | fn = os.path.join(mountdir, "test") | |
39 | with open(fn, "wb") as f: | |
40 | f.write(b"data") | |
41 | ||
42 | with open(fn) as f: | |
43 | assert f.read() == "data" | |
44 | ||
45 | os.remove(fn) | |
46 | ||
47 | os.mkdir(fn) | |
48 | assert os.listdir(fn) == [] | |
49 | ||
50 | os.mkdir(fn + "/inner") | |
51 | ||
52 | with pytest.raises(OSError): | |
53 | os.rmdir(fn) | |
54 | ||
55 | os.rmdir(fn + "/inner") | |
56 | os.rmdir(fn) | |
57 | finally: | |
58 | os.kill(fuse_process.pid, signal.SIGTERM) | |
59 | fuse_process.join() |
0 | import os | |
1 | import fsspec | |
2 | from fsspec.implementations.memory import MemoryFileSystem | |
3 | import pickle | |
4 | import pytest | |
5 | ||
6 | ||
7 | def test_mapping_prefix(tmpdir): | |
8 | tmpdir = str(tmpdir) | |
9 | os.makedirs(os.path.join(tmpdir, "afolder")) | |
10 | open(os.path.join(tmpdir, "afile"), "w").write("test") | |
11 | open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2") | |
12 | ||
13 | m = fsspec.get_mapper("file://" + tmpdir) | |
14 | assert "afile" in m | |
15 | assert m["afolder/anotherfile"] == b"test2" | |
16 | ||
17 | fs = fsspec.filesystem("file") | |
18 | m2 = fs.get_mapper(tmpdir) | |
19 | m3 = fs.get_mapper("file://" + tmpdir) | |
20 | ||
21 | assert m == m2 == m3 | |
22 | ||
23 | ||
24 | def test_ops(): | |
25 | MemoryFileSystem.store.clear() | |
26 | m = fsspec.get_mapper("memory://") | |
27 | assert not m | |
28 | assert list(m) == [] | |
29 | ||
30 | with pytest.raises(KeyError): | |
31 | m["hi"] | |
32 | ||
33 | assert m.pop("key", 0) == 0 | |
34 | ||
35 | m["key0"] = b"data" | |
36 | assert list(m) == ["key0"] | |
37 | assert m["key0"] == b"data" | |
38 | ||
39 | m.clear() | |
40 | ||
41 | assert list(m) == [] | |
42 | ||
43 | ||
44 | def test_pickle(): | |
45 | m = fsspec.get_mapper("memory://") | |
46 | assert isinstance(m.fs, MemoryFileSystem) | |
47 | m["key"] = b"data" | |
48 | m2 = pickle.loads(pickle.dumps(m)) | |
49 | assert list(m) == list(m2) | |
50 | ||
51 | ||
52 | def test_keys_view(): | |
53 | # https://github.com/intake/filesystem_spec/issues/186 | |
54 | m = fsspec.get_mapper("memory://") | |
55 | m["key"] = b"data" | |
56 | ||
57 | keys = m.keys() | |
58 | assert len(keys) == 1 | |
59 | # check that we don't consume the keys | |
60 | assert len(keys) == 1 |
0 | import pytest | |
1 | from fsspec.registry import get_filesystem_class, registry | |
2 | ||
3 | ||
4 | @pytest.mark.parametrize( | |
5 | "protocol,module,minversion,oldversion", | |
6 | [("s3", "s3fs", "0.3.0", "0.1.0"), ("gs", "gcsfs", "0.3.0", "0.1.0")], | |
7 | ) | |
8 | def test_minversion(protocol, module, minversion, oldversion, monkeypatch): | |
9 | registry.clear() | |
10 | mod = pytest.importorskip(module, minversion) | |
11 | ||
12 | assert get_filesystem_class("s3") is not None | |
13 | registry.clear() | |
14 | ||
15 | monkeypatch.setattr(mod, "__version__", oldversion) | |
16 | with pytest.raises(RuntimeError, match=minversion): | |
17 | get_filesystem_class(protocol) |
0 | import pytest | |
1 | from fsspec.spec import AbstractFileSystem, AbstractBufferedFile | |
2 | ||
3 | ||
4 | class DummyTestFS(AbstractFileSystem): | |
5 | protocol = "mock" | |
6 | _fs_contents = ( | |
7 | {"name": "top_level/second_level/date=2019-10-01/", "type": "directory"}, | |
8 | { | |
9 | "name": "top_level/second_level/date=2019-10-01/a.parquet", | |
10 | "type": "file", | |
11 | "size": 100, | |
12 | }, | |
13 | { | |
14 | "name": "top_level/second_level/date=2019-10-01/b.parquet", | |
15 | "type": "file", | |
16 | "size": 100, | |
17 | }, | |
18 | {"name": "top_level/second_level/date=2019-10-02/", "type": "directory"}, | |
19 | { | |
20 | "name": "top_level/second_level/date=2019-10-02/a.parquet", | |
21 | "type": "file", | |
22 | "size": 100, | |
23 | }, | |
24 | {"name": "top_level/second_level/date=2019-10-04/", "type": "directory"}, | |
25 | { | |
26 | "name": "top_level/second_level/date=2019-10-04/a.parquet", | |
27 | "type": "file", | |
28 | "size": 100, | |
29 | }, | |
30 | {"name": "misc/", "type": "directory"}, | |
31 | {"name": "misc/foo.txt", "type": "file", "size": 100}, | |
32 | ) | |
33 | ||
34 | def ls(self, path, detail=True, **kwargs): | |
35 | files = (file for file in self._fs_contents if path in file["name"]) | |
36 | ||
37 | if detail: | |
38 | return list(files) | |
39 | ||
40 | return sorted(file["name"] for file in files) | |
41 | ||
42 | ||
43 | @pytest.mark.parametrize( | |
44 | "test_path, expected", | |
45 | [ | |
46 | ( | |
47 | "mock://top_level/second_level/date=2019-10-01/a.parquet", | |
48 | ["top_level/second_level/date=2019-10-01/a.parquet"], | |
49 | ), | |
50 | ( | |
51 | "mock://top_level/second_level/date=2019-10-01/*", | |
52 | [ | |
53 | "top_level/second_level/date=2019-10-01/a.parquet", | |
54 | "top_level/second_level/date=2019-10-01/b.parquet", | |
55 | ], | |
56 | ), | |
57 | ( | |
58 | "mock://top_level/second_level/date=2019-10-0[1-4]", | |
59 | [ | |
60 | "top_level/second_level/date=2019-10-01", | |
61 | "top_level/second_level/date=2019-10-02", | |
62 | "top_level/second_level/date=2019-10-04", | |
63 | ], | |
64 | ), | |
65 | ( | |
66 | "mock://top_level/second_level/date=2019-10-0[1-4]/*", | |
67 | [ | |
68 | "top_level/second_level/date=2019-10-01/a.parquet", | |
69 | "top_level/second_level/date=2019-10-01/b.parquet", | |
70 | "top_level/second_level/date=2019-10-02/a.parquet", | |
71 | "top_level/second_level/date=2019-10-04/a.parquet", | |
72 | ], | |
73 | ), | |
74 | ( | |
75 | "mock://top_level/second_level/date=2019-10-0[1-4]/[a].*", | |
76 | [ | |
77 | "top_level/second_level/date=2019-10-01/a.parquet", | |
78 | "top_level/second_level/date=2019-10-02/a.parquet", | |
79 | "top_level/second_level/date=2019-10-04/a.parquet", | |
80 | ], | |
81 | ), | |
82 | ], | |
83 | ) | |
84 | def test_glob(test_path, expected): | |
85 | test_fs = DummyTestFS() | |
86 | ||
87 | assert test_fs.glob(test_path) == expected | |
88 | ||
89 | ||
90 | def test_cache(): | |
91 | fs = DummyTestFS() | |
92 | fs2 = DummyTestFS() | |
93 | assert fs is fs2 | |
94 | ||
95 | assert len(fs._cache) == 1 | |
96 | del fs2 | |
97 | assert len(fs._cache) == 1 | |
98 | del fs | |
99 | assert len(DummyTestFS._cache) == 1 | |
100 | ||
101 | DummyTestFS.clear_instance_cache() | |
102 | assert len(DummyTestFS._cache) == 0 | |
103 | ||
104 | ||
105 | def test_alias(): | |
106 | with pytest.warns(FutureWarning, match="add_aliases"): | |
107 | DummyTestFS(add_aliases=True) | |
108 | ||
109 | ||
110 | def test_add_docs_warns(): | |
111 | with pytest.warns(FutureWarning, match="add_docs"): | |
112 | AbstractFileSystem(add_docs=True) | |
113 | ||
114 | ||
115 | def test_cache_options(): | |
116 | fs = DummyTestFS() | |
117 | f = AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes") | |
118 | assert f.cache.trim | |
119 | ||
120 | # TODO: dummy buffered file | |
121 | f = AbstractBufferedFile( | |
122 | fs, "misc/foo.txt", cache_type="bytes", cache_options=dict(trim=False) | |
123 | ) | |
124 | assert f.cache.trim is False | |
125 | ||
126 | f = fs.open("misc/foo.txt", cache_type="bytes", cache_options=dict(trim=False)) | |
127 | assert f.cache.trim is False | |
128 | ||
129 | ||
130 | def test_trim_kwarg_warns(): | |
131 | fs = DummyTestFS() | |
132 | with pytest.warns(FutureWarning, match="cache_options"): | |
133 | AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes", trim=False) | |
134 | ||
135 | ||
136 | def test_eq(): | |
137 | fs = DummyTestFS() | |
138 | result = fs == 1 | |
139 | assert result is False |
0 | import io | |
1 | import pytest | |
2 | from fsspec.utils import infer_storage_options, seek_delimiter, read_block | |
3 | ||
4 | ||
5 | def test_read_block(): | |
6 | delimiter = b"\n" | |
7 | data = delimiter.join([b"123", b"456", b"789"]) | |
8 | f = io.BytesIO(data) | |
9 | ||
10 | assert read_block(f, 1, 2) == b"23" | |
11 | assert read_block(f, 0, 1, delimiter=b"\n") == b"123\n" | |
12 | assert read_block(f, 0, 2, delimiter=b"\n") == b"123\n" | |
13 | assert read_block(f, 0, 3, delimiter=b"\n") == b"123\n" | |
14 | assert read_block(f, 0, 5, delimiter=b"\n") == b"123\n456\n" | |
15 | assert read_block(f, 0, 8, delimiter=b"\n") == b"123\n456\n789" | |
16 | assert read_block(f, 0, 100, delimiter=b"\n") == b"123\n456\n789" | |
17 | assert read_block(f, 1, 1, delimiter=b"\n") == b"" | |
18 | assert read_block(f, 1, 5, delimiter=b"\n") == b"456\n" | |
19 | assert read_block(f, 1, 8, delimiter=b"\n") == b"456\n789" | |
20 | ||
21 | for ols in [[(0, 3), (3, 3), (6, 3), (9, 2)], [(0, 4), (4, 4), (8, 4)]]: | |
22 | out = [read_block(f, o, l, b"\n") for o, l in ols] | |
23 | assert b"".join(filter(None, out)) == data | |
24 | ||
25 | ||
26 | def test_read_block_split_before(): | |
27 | """Test start/middle/end cases of split_before.""" # noqa: I | |
28 | d = ( | |
29 | "#header" + "".join(">foo{i}\nFOOBAR{i}\n".format(i=i) for i in range(100000)) | |
30 | ).encode() | |
31 | ||
32 | # Read single record at beginning. | |
33 | # All reads include beginning of file and read through termination of | |
34 | # delimited record. | |
35 | assert read_block(io.BytesIO(d), 0, 10, delimiter=b"\n") == b"#header>foo0\n" | |
36 | assert ( | |
37 | read_block(io.BytesIO(d), 0, 10, delimiter=b"\n", split_before=True) | |
38 | == b"#header>foo0" | |
39 | ) | |
40 | assert ( | |
41 | read_block(io.BytesIO(d), 0, 10, delimiter=b">") == b"#header>foo0\nFOOBAR0\n>" | |
42 | ) | |
43 | assert ( | |
44 | read_block(io.BytesIO(d), 0, 10, delimiter=b">", split_before=True) | |
45 | == b"#header>foo0\nFOOBAR0\n" | |
46 | ) | |
47 | ||
48 | # Read multiple records at beginning. | |
49 | # All reads include beginning of file and read through termination of | |
50 | # delimited record. | |
51 | assert ( | |
52 | read_block(io.BytesIO(d), 0, 27, delimiter=b"\n") | |
53 | == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n" | |
54 | ) | |
55 | assert ( | |
56 | read_block(io.BytesIO(d), 0, 27, delimiter=b"\n", split_before=True) | |
57 | == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1" | |
58 | ) | |
59 | assert ( | |
60 | read_block(io.BytesIO(d), 0, 27, delimiter=b">") | |
61 | == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n>" | |
62 | ) | |
63 | assert ( | |
64 | read_block(io.BytesIO(d), 0, 27, delimiter=b">", split_before=True) | |
65 | == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n" | |
66 | ) | |
67 | ||
68 | # Read with offset spanning into next record, splits on either side of delimiter. | |
69 | # Read not spanning the full record returns nothing. | |
70 | assert read_block(io.BytesIO(d), 10, 3, delimiter=b"\n") == b"FOOBAR0\n" | |
71 | assert ( | |
72 | read_block(io.BytesIO(d), 10, 3, delimiter=b"\n", split_before=True) | |
73 | == b"\nFOOBAR0" | |
74 | ) | |
75 | assert read_block(io.BytesIO(d), 10, 3, delimiter=b">") == b"" | |
76 | assert read_block(io.BytesIO(d), 10, 3, delimiter=b">", split_before=True) == b"" | |
77 | ||
78 | # Read with offset spanning multiple records, splits on either side of delimiter | |
79 | assert ( | |
80 | read_block(io.BytesIO(d), 10, 20, delimiter=b"\n") | |
81 | == b"FOOBAR0\n>foo1\nFOOBAR1\n" | |
82 | ) | |
83 | assert ( | |
84 | read_block(io.BytesIO(d), 10, 20, delimiter=b"\n", split_before=True) | |
85 | == b"\nFOOBAR0\n>foo1\nFOOBAR1" | |
86 | ) | |
87 | assert read_block(io.BytesIO(d), 10, 20, delimiter=b">") == b"foo1\nFOOBAR1\n>" | |
88 | assert ( | |
89 | read_block(io.BytesIO(d), 10, 20, delimiter=b">", split_before=True) | |
90 | == b">foo1\nFOOBAR1\n" | |
91 | ) | |
92 | ||
93 | # Read record at end, all records read to end | |
94 | ||
95 | tlen = len(d) | |
96 | ||
97 | assert ( | |
98 | read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n") | |
99 | == b">foo99999\nFOOBAR99999\n" | |
100 | ) | |
101 | ||
102 | assert ( | |
103 | read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n", split_before=True) | |
104 | == b"\n>foo99999\nFOOBAR99999\n" | |
105 | ) | |
106 | ||
107 | assert ( | |
108 | read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">") | |
109 | == b"foo99999\nFOOBAR99999\n" | |
110 | ) | |
111 | ||
112 | assert ( | |
113 | read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">", split_before=True) | |
114 | == b">foo99999\nFOOBAR99999\n" | |
115 | ) | |
116 | ||
117 | ||
118 | def test_seek_delimiter_endline(): | |
119 | f = io.BytesIO(b"123\n456\n789") | |
120 | ||
121 | # if at zero, stay at zero | |
122 | seek_delimiter(f, b"\n", 5) | |
123 | assert f.tell() == 0 | |
124 | ||
125 | # choose the first block | |
126 | for bs in [1, 5, 100]: | |
127 | f.seek(1) | |
128 | seek_delimiter(f, b"\n", blocksize=bs) | |
129 | assert f.tell() == 4 | |
130 | ||
131 | # handle long delimiters well, even with short blocksizes | |
132 | f = io.BytesIO(b"123abc456abc789") | |
133 | for bs in [1, 2, 3, 4, 5, 6, 10]: | |
134 | f.seek(1) | |
135 | seek_delimiter(f, b"abc", blocksize=bs) | |
136 | assert f.tell() == 6 | |
137 | ||
138 | # End at the end | |
139 | f = io.BytesIO(b"123\n456") | |
140 | f.seek(5) | |
141 | seek_delimiter(f, b"\n", 5) | |
142 | assert f.tell() == 7 | |
143 | ||
144 | ||
145 | def test_infer_options(): | |
146 | so = infer_storage_options("/mnt/datasets/test.csv") | |
147 | assert so.pop("protocol") == "file" | |
148 | assert so.pop("path") == "/mnt/datasets/test.csv" | |
149 | assert not so | |
150 | ||
151 | assert infer_storage_options("./test.csv")["path"] == "./test.csv" | |
152 | assert infer_storage_options("../test.csv")["path"] == "../test.csv" | |
153 | ||
154 | so = infer_storage_options("C:\\test.csv") | |
155 | assert so.pop("protocol") == "file" | |
156 | assert so.pop("path") == "C:\\test.csv" | |
157 | assert not so | |
158 | ||
159 | assert infer_storage_options("d:\\test.csv")["path"] == "d:\\test.csv" | |
160 | assert infer_storage_options("\\test.csv")["path"] == "\\test.csv" | |
161 | assert infer_storage_options(".\\test.csv")["path"] == ".\\test.csv" | |
162 | assert infer_storage_options("test.csv")["path"] == "test.csv" | |
163 | ||
164 | so = infer_storage_options( | |
165 | "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm", | |
166 | inherit_storage_options={"extra": "value"}, | |
167 | ) | |
168 | assert so.pop("protocol") == "hdfs" | |
169 | assert so.pop("username") == "username" | |
170 | assert so.pop("password") == "pwd" | |
171 | assert so.pop("host") == "Node" | |
172 | assert so.pop("port") == 123 | |
173 | assert so.pop("path") == "/mnt/datasets/test.csv#fragm" | |
174 | assert so.pop("url_query") == "q=1" | |
175 | assert so.pop("url_fragment") == "fragm" | |
176 | assert so.pop("extra") == "value" | |
177 | assert not so | |
178 | ||
179 | so = infer_storage_options("hdfs://User-name@Node-name.com/mnt/datasets/test.csv") | |
180 | assert so.pop("username") == "User-name" | |
181 | assert so.pop("host") == "Node-name.com" | |
182 | ||
183 | u = "http://127.0.0.1:8080/test.csv" | |
184 | assert infer_storage_options(u) == {"protocol": "http", "path": u} | |
185 | ||
186 | # For s3 and gcs the netloc is actually the bucket name, so we want to | |
187 | # include it in the path. Test that: | |
188 | # - Parsing doesn't lowercase the bucket | |
189 | # - The bucket is included in path | |
190 | for protocol in ["s3", "gcs", "gs"]: | |
191 | options = infer_storage_options("%s://Bucket-name.com/test.csv" % protocol) | |
192 | assert options["path"] == "Bucket-name.com/test.csv" | |
193 | ||
194 | with pytest.raises(KeyError): | |
195 | infer_storage_options("file:///bucket/file.csv", {"path": "collide"}) | |
196 | with pytest.raises(KeyError): | |
197 | infer_storage_options("hdfs:///bucket/file.csv", {"protocol": "collide"}) | |
198 | ||
199 | ||
200 | @pytest.mark.parametrize( | |
201 | "urlpath, expected_path", | |
202 | ( | |
203 | (r"c:\foo\bar", r"c:\foo\bar"), | |
204 | (r"C:\\foo\bar", r"C:\\foo\bar"), | |
205 | (r"c:/foo/bar", r"c:/foo/bar"), | |
206 | (r"file:///c|\foo\bar", r"c:\foo\bar"), | |
207 | (r"file:///C|/foo/bar", r"C:/foo/bar"), | |
208 | (r"file:///C:/foo/bar", r"C:/foo/bar"), | |
209 | ), | |
210 | ) | |
211 | def test_infer_storage_options_c(urlpath, expected_path): | |
212 | so = infer_storage_options(urlpath) | |
213 | assert so["protocol"] == "file" | |
214 | assert so["path"] == expected_path |
0 | class Transaction(object): | |
1 | """Filesystem transaction write context | |
2 | ||
3 | Gathers files for deferred commit or discard, so that several write | |
4 | operations can be finalized semi-atomically. This works by having this | |
5 | instance as the ``.transaction`` attribute of the given filesystem | |
6 | """ | |
7 | ||
8 | def __init__(self, fs): | |
9 | """ | |
10 | Parameters | |
11 | ---------- | |
12 | fs: FileSystem instance | |
13 | """ | |
14 | self.fs = fs | |
15 | self.files = [] | |
16 | ||
17 | def __enter__(self): | |
18 | self.start() | |
 | return self | |
19 | ||
20 | def __exit__(self, exc_type, exc_val, exc_tb): | |
21 | """End transaction and commit, if exit is not due to exception""" | |
22 | # only commit if there was no exception | |
23 | self.complete(commit=exc_type is None) | |
24 | self.fs._intrans = False | |
25 | self.fs._transaction = None | |
26 | ||
27 | def start(self): | |
28 | """Start a transaction on this FileSystem""" | |
29 | self.fs._intrans = True | |
30 | ||
31 | def complete(self, commit=True): | |
32 | """Finish transaction: commit or discard all deferred files""" | |
33 | for f in self.files: | |
34 | if commit: | |
35 | f.commit() | |
36 | else: | |
37 | f.discard() | |
38 | self.files = [] | |
39 | self.fs._intrans = False | |
40 | ||
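 | # A sketch of the intended usage (editorial illustration; assumes a | |
 | # backend whose file objects implement commit/discard): | |
 | # | |
 | #     import fsspec | |
 | #     fs = fsspec.filesystem("memory")  # stand-in for a real backend | |
 | #     with fs.transaction:  # __enter__ -> start(); commit on clean exit | |
 | #         with fs.open("/staged", "wb") as f: | |
 | #             f.write(b"payload")  # finalized only if no exception is raised | |
 | ||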
41 | ||
42 | class FileActor(object): | |
43 | def __init__(self): | |
44 | self.files = [] | |
45 | ||
46 | def commit(self): | |
47 | for f in self.files: | |
48 | f.commit() | |
49 | self.files.clear() | |
50 | ||
51 | def discard(self): | |
52 | for f in self.files: | |
53 | f.discard() | |
54 | self.files.clear() | |
55 | ||
56 | def append(self, f): | |
57 | self.files.append(f) | |
58 | ||
59 | ||
60 | class DaskTransaction(Transaction): | |
61 | def __init__(self, fs): | |
62 | """ | |
63 | Parameters | |
64 | ---------- | |
65 | fs: FileSystem instance | |
66 | """ | |
67 | import distributed | |
68 | ||
69 | super().__init__(fs) | |
70 | client = distributed.default_client() | |
71 | self.files = client.submit(FileActor, actor=True).result() | |
72 | ||
73 | def complete(self, commit=True): | |
74 | """Finish transaction: commit or discard all deferred files""" | |
75 | if commit: | |
76 | self.files.commit().result() | |
77 | else: | |
78 | self.files.discard().result() | |
79 | self.fs._intrans = False |
0 | from hashlib import md5 | |
1 | import math | |
2 | import os | |
3 | import pathlib | |
4 | import re | |
5 | from urllib.parse import urlsplit | |
6 | ||
7 | ||
8 | DEFAULT_BLOCK_SIZE = 5 * 2 ** 20 | |
9 | ||
10 | ||
11 | def infer_storage_options(urlpath, inherit_storage_options=None): | |
12 | """ Infer storage options from URL path and merge it with existing storage | |
13 | options. | |
14 | ||
15 | Parameters | |
16 | ---------- | |
17 | urlpath: str or unicode | |
18 | Either local absolute file path or URL (hdfs://namenode:8020/file.csv) | |
19 | inherit_storage_options: dict (optional) | |
20 | Its contents will get merged with the inferred information from the | |
21 | given path | |
22 | ||
23 | Returns | |
24 | ------- | |
25 | Storage options dict. | |
26 | ||
27 | Examples | |
28 | -------- | |
29 | >>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP | |
30 | {"protocol": "file", "path", "/mnt/datasets/test.csv"} | |
31 | >>> infer_storage_options( | |
32 | ... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1', | |
33 | ... inherit_storage_options={'extra': 'value'}) # doctest: +SKIP | |
34 | {"protocol": "hdfs", "username": "username", "password": "pwd", | |
35 | "host": "node", "port": 123, "path": "/mnt/datasets/test.csv", | |
36 | "url_query": "q=1", "extra": "value"} | |
37 | """ | |
38 | # Handle Windows paths including disk name in this special case | |
39 | if re.match(r"^[a-zA-Z]:[\\/]", urlpath): | |
40 | return {"protocol": "file", "path": urlpath} | |
41 | ||
42 | parsed_path = urlsplit(urlpath) | |
43 | protocol = parsed_path.scheme or "file" | |
44 | if parsed_path.fragment: | |
45 | path = "#".join([parsed_path.path, parsed_path.fragment]) | |
46 | else: | |
47 | path = parsed_path.path | |
48 | if protocol == "file": | |
49 | # Special case parsing file protocol URL on Windows according to: | |
50 | # https://msdn.microsoft.com/en-us/library/jj710207.aspx | |
51 | windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path) | |
52 | if windows_path: | |
53 | path = "%s:%s" % windows_path.groups() | |
54 | ||
55 | if protocol in ["http", "https"]: | |
56 | # for HTTP, we don't want to parse, as requests will anyway | |
57 | return {"protocol": protocol, "path": urlpath} | |
58 | ||
59 | options = {"protocol": protocol, "path": path} | |
60 | ||
61 | if parsed_path.netloc: | |
62 | # Parse `hostname` from netloc manually because `parsed_path.hostname` | |
63 | # lowercases the hostname which is not always desirable (e.g. in S3): | |
64 | # https://github.com/dask/dask/issues/1417 | |
65 | options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0] | |
66 | ||
67 | if protocol in ("s3", "gcs", "gs"): | |
68 | options["path"] = options["host"] + options["path"] | |
71 | if parsed_path.port: | |
72 | options["port"] = parsed_path.port | |
73 | if parsed_path.username: | |
74 | options["username"] = parsed_path.username | |
75 | if parsed_path.password: | |
76 | options["password"] = parsed_path.password | |
77 | ||
78 | if parsed_path.query: | |
79 | options["url_query"] = parsed_path.query | |
80 | if parsed_path.fragment: | |
81 | options["url_fragment"] = parsed_path.fragment | |
82 | ||
83 | if inherit_storage_options: | |
84 | update_storage_options(options, inherit_storage_options) | |
85 | ||
86 | return options | |
87 | ||
88 | ||
89 | def update_storage_options(options, inherited=None): | |
90 | if not inherited: | |
91 | inherited = {} | |
92 | collisions = set(options) & set(inherited) | |
93 | if collisions: | |
94 | collisions = "\n".join("- %r" % k for k in collisions) | |
95 | raise KeyError( | |
96 | "Collision between inferred and specified storage " | |
97 | "options:\n%s" % collisions | |
98 | ) | |
99 | options.update(inherited) | |
100 | ||
101 | ||
102 | # Compression extensions registered via fsspec.compression.register_compression | |
103 | compressions = {} | |
104 | ||
105 | ||
106 | def infer_compression(filename): | |
107 | """Infer compression, if available, from filename. | |
108 | ||
109 | Infer a named compression type, if registered and available, from filename | |
110 | extension. This includes builtin (gz, bz2, zip) compressions, as well as | |
111 | optional compressions. See fsspec.compression.register_compression. | |
112 | """ | |
113 | extension = os.path.splitext(filename)[-1].strip(".") | |
114 | if extension in compressions: | |
115 | return compressions[extension] | |
116 | ||
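 | # For example, once fsspec.compression has registered the builtins: | |
 | # | |
 | #     infer_compression("data.csv.gz")  # -> "gzip" | |
 | #     infer_compression("data.csv")     # -> None (no registered extension) | |
 | ||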
117 | ||
118 | def build_name_function(max_int): | |
119 | """ Returns a function that receives a single integer | |
120 | and returns it as a string padded by enough zero characters | |
121 | to align with maximum possible integer | |
122 | ||
123 | >>> name_f = build_name_function(57) | |
124 | ||
125 | >>> name_f(7) | |
126 | '07' | |
127 | >>> name_f(31) | |
128 | '31' | |
129 | >>> build_name_function(1000)(42) | |
130 | '0042' | |
131 | >>> build_name_function(999)(42) | |
132 | '042' | |
133 | >>> build_name_function(0)(0) | |
134 | '0' | |
135 | """ | |
136 | # handle corner cases where max_int is 0 or an exact power of 10 | |
137 | max_int += 1e-8 | |
138 | ||
139 | pad_length = int(math.ceil(math.log10(max_int))) | |
140 | ||
141 | def name_function(i): | |
142 | return str(i).zfill(pad_length) | |
143 | ||
144 | return name_function | |
145 | ||
146 | ||
147 | def seek_delimiter(file, delimiter, blocksize): | |
148 | r"""Seek current file to file start, file end, or byte after delimiter seq. | |
149 | ||
150 | Seeks file to next chunk delimiter, where chunks are defined on file start, | |
151 | a delimiting sequence, and file end. Use file.tell() to see location afterwards. | |
152 | Note that file start is a valid split, so must be at offset > 0 to seek for | |
153 | delimiter. | |
154 | ||
155 | Parameters | |
156 | ---------- | |
157 | file: a file | |
158 | delimiter: bytes | |
159 | a delimiter like ``b'\n'`` or message sentinel, matching file .read() type | |
160 | blocksize: int | |
161 | Number of bytes to read from the file at once. | |
162 | ||
163 | ||
164 | Returns | |
165 | ------- | |
166 | Returns True if a delimiter was found, False if at file start or end. | |
167 | ||
168 | """ | |
169 | ||
170 | if file.tell() == 0: | |
171 | # beginning-of-file, return without seek | |
172 | return False | |
173 | ||
174 | # The interface is for binary IO with a bytes delimiter, but ``last`` is | |
175 | # seeded from file.read results (not b"") to stay compatible with text IO. | |
176 | last = None | |
177 | while True: | |
178 | current = file.read(blocksize) | |
179 | if not current: | |
180 | # end-of-file without delimiter | |
181 | return False | |
182 | full = last + current if last else current | |
183 | try: | |
184 | if delimiter in full: | |
185 | i = full.index(delimiter) | |
186 | file.seek(file.tell() - (len(full) - i) + len(delimiter)) | |
187 | return True | |
188 | elif len(current) < blocksize: | |
189 | # end-of-file without delimiter | |
190 | return False | |
191 | except (OSError, ValueError): | |
192 | pass | |
193 | last = full[-len(delimiter) :] | |
194 | ||
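 | # For example (mirroring the behaviour exercised in test_utils): | |
 | # | |
 | #     f = io.BytesIO(b"123\n456\n789") | |
 | #     f.seek(1) | |
 | #     seek_delimiter(f, b"\n", blocksize=5)  # -> True | |
 | #     f.tell()  # -> 4, the byte just past the first delimiter | |
 | ||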
195 | ||
196 | def read_block(f, offset, length, delimiter=None, split_before=False): | |
197 | """ Read a block of bytes from a file | |
198 | ||
199 | Parameters | |
200 | ---------- | |
201 | f: File | |
202 | Open file | |
203 | offset: int | |
204 | Byte offset to start read | |
205 | length: int | |
206 | Number of bytes to read, read through end of file if None | |
207 | delimiter: bytes (optional) | |
208 | Ensure reading starts and stops at delimiter bytestring | |
209 | split_before: bool (optional) | |
210 | Start/stop read *before* delimiter bytestring. | |
211 | ||
212 | ||
213 | If using the ``delimiter=`` keyword argument we ensure that the read | |
214 | starts and stops at delimiter boundaries that follow the locations | |
215 | ``offset`` and ``offset + length``. If ``offset`` is zero then we | |
216 | start at zero, regardless of delimiter. The bytestring returned WILL | |
217 | include the terminating delimiter string. | |
218 | ||
219 | Examples | |
220 | -------- | |
221 | ||
222 | >>> from io import BytesIO # doctest: +SKIP | |
223 | >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300') # doctest: +SKIP | |
224 | >>> read_block(f, 0, 13) # doctest: +SKIP | |
225 | b'Alice, 100\\nBo' | |
226 | ||
227 | >>> read_block(f, 0, 13, delimiter=b'\\n') # doctest: +SKIP | |
228 | b'Alice, 100\\nBob, 200\\n' | |
229 | ||
230 | >>> read_block(f, 10, 10, delimiter=b'\\n') # doctest: +SKIP | |
231 | b'Bob, 200\\nCharlie, 300' | |
232 | """ | |
233 | if delimiter: | |
234 | f.seek(offset) | |
235 | found_start_delim = seek_delimiter(f, delimiter, 2 ** 16) | |
236 | if length is None: | |
237 | return f.read() | |
238 | start = f.tell() | |
239 | length -= start - offset | |
240 | ||
241 | f.seek(start + length) | |
242 | found_end_delim = seek_delimiter(f, delimiter, 2 ** 16) | |
243 | end = f.tell() | |
244 | ||
245 | # Adjust split location to before delimiter iff seek found the | |
246 | # delimiter sequence, not start or end of file. | |
247 | if found_start_delim and split_before: | |
248 | start -= len(delimiter) | |
249 | ||
250 | if found_end_delim and split_before: | |
251 | end -= len(delimiter) | |
252 | ||
253 | offset = start | |
254 | length = end - start | |
255 | ||
256 | f.seek(offset) | |
257 | b = f.read(length) | |
258 | return b | |
259 | ||
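# Sketch of the ``split_before`` behaviour (hypothetical call, mirroring the
# doctest data above): block boundaries land *before* the delimiter, so the
# block begins with b"\n" and excludes the terminating delimiter.
#
#     from io import BytesIO
#     f = BytesIO(b"Alice, 100\nBob, 200\nCharlie, 300")
#     read_block(f, 1, 10, delimiter=b"\n", split_before=True)
#     # -> b"\nBob, 200"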
260 | ||
261 | def tokenize(*args, **kwargs): | |
262 | """ Deterministic token | |
263 | ||
264 | (modified from dask.base) | |
265 | ||
266 | >>> tokenize([1, 2, '3']) | |
267 | '9d71491b50023b06fc76928e6eddb952' | |
268 | ||
269 | >>> tokenize('Hello') == tokenize('Hello') | |
270 | True | |
271 | """ | |
272 | if kwargs: | |
273 | args += (kwargs,) | |
274 | return md5(str(args).encode()).hexdigest() | |
275 | ||
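# Note that keyword arguments are folded into the positional tuple before
# hashing, so (as a sketch) these two calls produce the same token:
#
#     tokenize(1, a=2) == tokenize(1, {"a": 2})   # -> True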
276 | ||
277 | def stringify_path(filepath): | |
278 | """ Attempt to convert a path-like object to a string. | |
279 | ||
280 | Parameters | |
281 | ---------- | |
282 | filepath: object to be converted | |
283 | ||
284 | Returns | |
285 | ------- | |
286 | filepath_str: maybe a string version of the object | |
287 | ||
288 | Notes | |
289 | ----- | |
290 | Objects supporting the fspath protocol (Python 3.6+) are coerced | |
291 | via their __fspath__ method. | |
292 | ||
293 | For backwards compatibility with older Python versions, pathlib.Path | |
294 | objects are specially coerced. | |
295 | ||
296 | Any other object is passed through unchanged, which includes bytes, | |
297 | strings, buffers, or anything else that's not even path-like. | |
298 | """ | |
299 | if hasattr(filepath, "__fspath__"): | |
300 | return filepath.__fspath__() | |
301 | elif isinstance(filepath, pathlib.Path): | |
302 | return str(filepath) | |
303 | return filepath |
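# Illustrative behaviour (a sketch, not additional doctests):
#
#     import pathlib
#     stringify_path(pathlib.Path("/tmp/data.csv"))   # -> "/tmp/data.csv"
#     stringify_path("already-a-string")              # -> "already-a-string"
#     stringify_path(b"raw-bytes")                    # -> b"raw-bytes" (unchanged)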
0 | [tool.black] | |
1 | # Revert to py34 target syntax to accommodate | |
2 | # errors in trailing commas. | |
3 | # https://github.com/psf/black/pull/763 | |
4 | target_version = ['py34'] |
0 | [metadata] | |
1 | long_description: file: README.rst | |
2 | ||
3 | [versioneer] | |
4 | VCS = git | |
5 | style = pep440 | |
6 | versionfile_source = fsspec/_version.py | |
7 | versionfile_build = fsspec/_version.py | |
8 | tag_prefix = "" | |
9 | ||
10 | [flake8] | |
11 | exclude = .tox,build,docs/source/conf.py,versioneer.py | |
12 | max-line-length = 88 | |
13 | ignore = | |
14 | # Assigning lambda expression | |
15 | E731 | |
16 | # Ambiguous variable names | |
17 | E741 | |
18 | # line break before binary operator | |
19 | W503 | |
20 | # whitespace before : | |
21 | E203 |
0 | #!/usr/bin/env python | |
1 | import os | |
2 | ||
3 | from setuptools import setup | |
4 | import versioneer | |
5 | ||
6 | here = os.path.abspath(os.path.dirname(__file__)) | |
7 | with open(os.path.join(here, "README.md"), encoding="utf-8") as f: | |
8 | long_description = f.read() | |
9 | ||
10 | setup( | |
11 | name="fsspec", | |
12 | version=versioneer.get_version(), | |
13 | cmdclass=versioneer.get_cmdclass(), | |
14 | classifiers=[ | |
15 | "Development Status :: 4 - Beta", | |
16 | "Intended Audience :: Developers", | |
17 | "License :: OSI Approved :: BSD License", | |
18 | "Operating System :: OS Independent", | |
19 | "Programming Language :: Python :: 3.5", | |
20 | "Programming Language :: Python :: 3.6", | |
21 | "Programming Language :: Python :: 3.7", | |
22 | ], | |
23 | description="File-system specification", | |
24 | long_description=long_description, | |
25 | long_description_content_type="text/markdown", | |
26 | url="http://github.com/intake/filesystem_spec", | |
27 | maintainer="Martin Durant", | |
28 | maintainer_email="mdurant@anaconda.com", | |
29 | license="BSD", | |
30 | keywords="file", | |
31 | packages=["fsspec", "fsspec.implementations"], | |
32 | python_requires=">=3.5", | |
33 | install_requires=open("requirements.txt").read().strip().split("\n"), | |
34 | zip_safe=False, | |
35 | ) |
0 | # content of: tox.ini, put in the same dir as setup.py | |
1 | [tox] | |
2 | envlist = {py35,py36,py37} | |
3 | ||
4 | [core] | |
5 | conda_channels= | |
6 | defaults | |
7 | conda-forge | |
8 | conda_deps= | |
9 | pip | |
10 | paramiko | |
11 | requests | |
12 | zstandard | |
13 | python-snappy | |
14 | lz4 | |
15 | distributed | |
16 | dask | |
17 | pyarrow | |
18 | pyftpdlib | |
19 | cloudpickle | |
20 | pytest | |
21 | pytest-cov | |
22 | fusepy==3.0.1 | |
23 | deps= | |
24 | hadoop-test-cluster==0.1.0 | |
25 | ||
26 | [dev] | |
27 | conda_deps= | |
28 | conda-forge::pre-commit=1.18 | |
29 | black=19.3b0 | |
30 | flake8 | |
31 | deps= | |
32 | ||
33 | [testenv] | |
34 | description=Run test suite against target versions. | |
35 | conda_channels= | |
36 | {[core]conda_channels} | |
37 | conda_deps= | |
38 | {[core]conda_deps} | |
39 | deps= | |
40 | {[core]deps} | |
41 | commands = | |
42 | py.test -v -r s | |
43 | ||
44 | [testenv:coverage] | |
45 | description=Run test suite with coverage enabled. | |
46 | basepython=python3.7 | |
47 | conda_channels= | |
48 | {[core]conda_channels} | |
49 | conda_deps= | |
50 | {[core]conda_deps} | |
51 | deps= | |
52 | {[core]deps} | |
53 | commands = | |
54 | py.test --cov=fsspec -v -r s | |
55 | ||
56 | [testenv:dev] | |
57 | description=Setup conda dev env under '.tox/dev'. | |
58 | basepython=python3.7 | |
59 | usedevelop=True | |
60 | conda_channels= | |
61 | {[core]conda_channels} | |
62 | conda_deps= | |
63 | {[core]conda_deps} | |
64 | {[dev]conda_deps} | |
65 | deps= | |
66 | {[core]deps} | |
67 | {[dev]deps} | |
68 | commands = | |
69 | ||
70 | [testenv:lint] | |
71 | description=Run pre-commit checks. | |
72 | basepython=python3.7 | |
73 | skip_install=True | |
74 | conda_deps= | |
75 | {[dev]conda_deps} | |
76 | deps= | |
77 | {[dev]deps} | |
78 | commands_pre= | |
79 | pre-commit install --install-hooks | |
80 | commands= | |
81 | pre-commit run --all-files --show-diff-on-failure | |
82 | ||
83 | [testenv:s3fs] | |
84 | description=Run s3fs (@master) test suite against fsspec. | |
85 | conda_channels= | |
86 | defaults | |
87 | conda-forge | |
88 | conda_deps= | |
89 | {[core]conda_deps} | |
90 | boto3 | |
91 | botocore | |
92 | httpretty | |
93 | moto | |
94 | six | |
95 | mock | |
96 | deps= | |
97 | {[core]deps} | |
98 | changedir=.tox/s3fs/tmp | |
99 | whitelist_externals= | |
100 | rm | |
101 | git | |
102 | setenv= | |
103 | BOTO_CONFIG=/dev/null | |
104 | AWS_ACCESS_KEY_ID=foobar_key | |
105 | AWS_SECRET_ACCESS_KEY=foobar_secret | |
106 | commands= | |
107 | rm -rf s3fs | |
108 | git clone https://github.com/dask/s3fs | |
109 | py.test -vv s3fs/s3fs | |
110 | ||
111 | [testenv:gcsfs] | |
112 | description=Run gcsfs (@master) test suite against fsspec. | |
113 | conda_channels= | |
114 | defaults | |
115 | conda-forge | |
116 | conda_deps= | |
117 | {[core]conda_deps} | |
118 | requests | |
119 | decorator | |
120 | google-auth | |
121 | deps= | |
122 | {[core]deps} | |
123 | vcrpy | |
124 | google-auth-oauthlib | |
125 | changedir=.tox/gcsfs/tmp | |
126 | whitelist_externals= | |
127 | rm | |
128 | git | |
129 | setenv= | |
130 | GCSFS_RECORD_MODE=none | |
131 | commands= | |
132 | rm -rf gcsfs | |
133 | git clone https://github.com/dask/gcsfs | |
134 | py.test -vv gcsfs/gcsfs -k 'not fuse' |
0 | # Version: 0.18 | |
1 | ||
2 | """The Versioneer - like a rocketeer, but for versions. | |
3 | ||
4 | The Versioneer | |
5 | ============== | |
6 | ||
7 | * like a rocketeer, but for versions! | |
8 | * https://github.com/warner/python-versioneer | |
9 | * Brian Warner | |
10 | * License: Public Domain | |
11 | * Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy | |
12 | * [![Latest Version] | |
13 | (https://pypip.in/version/versioneer/badge.svg?style=flat) | |
14 | ](https://pypi.python.org/pypi/versioneer/) | |
15 | * [![Build Status] | |
16 | (https://travis-ci.org/warner/python-versioneer.png?branch=master) | |
17 | ](https://travis-ci.org/warner/python-versioneer) | |
18 | ||
19 | This is a tool for managing a recorded version number in distutils-based | |
20 | python projects. The goal is to remove the tedious and error-prone "update | |
21 | the embedded version string" step from your release process. Making a new | |
22 | release should be as easy as recording a new tag in your version-control | |
23 | system, and maybe making new tarballs. | |
24 | ||
25 | ||
26 | ## Quick Install | |
27 | ||
28 | * `pip install versioneer` to somewhere on your $PATH | |
29 | * add a `[versioneer]` section to your setup.cfg (see below) | |
30 | * run `versioneer install` in your source tree, commit the results | |
31 | ||
32 | ## Version Identifiers | |
33 | ||
34 | Source trees come from a variety of places: | |
35 | ||
36 | * a version-control system checkout (mostly used by developers) | |
37 | * a nightly tarball, produced by build automation | |
38 | * a snapshot tarball, produced by a web-based VCS browser, like github's | |
39 | "tarball from tag" feature | |
40 | * a release tarball, produced by "setup.py sdist", distributed through PyPI | |
41 | ||
42 | Within each source tree, the version identifier (either a string or a number, | |
43 | this tool is format-agnostic) can come from a variety of places: | |
44 | ||
45 | * ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows | |
46 | about recent "tags" and an absolute revision-id | |
47 | * the name of the directory into which the tarball was unpacked | |
48 | * an expanded VCS keyword ($Id$, etc) | |
49 | * a `_version.py` created by some earlier build step | |
50 | ||
51 | For released software, the version identifier is closely related to a VCS | |
52 | tag. Some projects use tag names that include more than just the version | |
53 | string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool | |
54 | needs to strip the tag prefix to extract the version identifier. For | |
55 | unreleased software (between tags), the version identifier should provide | |
56 | enough information to help developers recreate the same tree, while also | |
57 | giving them an idea of roughly how old the tree is (after version 1.2, before | |
58 | version 1.3). Many VCS systems can report a description that captures this, | |
59 | for example `git describe --tags --dirty --always` reports things like | |
60 | "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the | |
61 | 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has | |
62 | uncommitted changes). | |
63 | ||
64 | The version identifier is used for multiple purposes: | |
65 | ||
66 | * to allow the module to self-identify its version: `myproject.__version__` | |
67 | * to choose a name and prefix for a 'setup.py sdist' tarball | |
68 | ||
69 | ## Theory of Operation | |
70 | ||
71 | Versioneer works by adding a special `_version.py` file into your source | |
72 | tree, where your `__init__.py` can import it. This `_version.py` knows how to | |
73 | dynamically ask the VCS tool for version information at import time. | |
74 | ||
75 | `_version.py` also contains `$Revision$` markers, and the installation | |
76 | process marks `_version.py` to have this marker rewritten with a tag name | |
77 | during the `git archive` command. As a result, generated tarballs will | |
78 | contain enough information to get the proper version. | |
79 | ||
80 | To allow `setup.py` to compute a version too, a `versioneer.py` is added to | |
81 | the top level of your source tree, next to `setup.py` and the `setup.cfg` | |
82 | that configures it. This overrides several distutils/setuptools commands to | |
83 | compute the version when invoked, and changes `setup.py build` and `setup.py | |
84 | sdist` to replace `_version.py` with a small static file that contains just | |
85 | the generated version data. | |
86 | ||
87 | ## Installation | |
88 | ||
89 | See [INSTALL.md](./INSTALL.md) for detailed installation instructions. | |
90 | ||
91 | ## Version-String Flavors | |
92 | ||
93 | Code which uses Versioneer can learn about its version string at runtime by | |
94 | importing `_version` from your main `__init__.py` file and running the | |
95 | `get_versions()` function. From the "outside" (e.g. in `setup.py`), you can | |
96 | import the top-level `versioneer.py` and run `get_versions()`. | |
97 | ||
98 | Both functions return a dictionary with different flavors of version | |
99 | information: | |
100 | ||
101 | * `['version']`: A condensed version string, rendered using the selected | |
102 | style. This is the most commonly used value for the project's version | |
103 | string. The default "pep440" style yields strings like `0.11`, | |
104 | `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section | |
105 | below for alternative styles. | |
106 | ||
107 | * `['full-revisionid']`: detailed revision identifier. For Git, this is the | |
108 | full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". | |
109 | ||
110 | * `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the | |
111 | commit date in ISO 8601 format. This will be None if the date is not | |
112 | available. | |
113 | ||
114 | * `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that | |
115 | this is only accurate if run in a VCS checkout, otherwise it is likely to | |
116 | be False or None | |
117 | ||
118 | * `['error']`: if the version string could not be computed, this will be set | |
119 | to a string describing the problem, otherwise it will be None. It may be | |
120 | useful to throw an exception in setup.py if this is set, to avoid e.g. | |
121 | creating tarballs with a version string of "unknown". | |
122 | ||
123 | Some variants are more useful than others. Including `full-revisionid` in a | |
124 | bug report should allow developers to reconstruct the exact code being tested | |
125 | (or indicate the presence of local changes that should be shared with the | |
126 | developers). `version` is suitable for display in an "about" box or a CLI | |
127 | `--version` output: it can be easily compared against release notes and lists | |
128 | of bugs fixed in various releases. | |
129 | ||
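As an illustration (hypothetical values), a development checkout two commits
past a "0.6.0" tag, with local modifications, might report:

    {'version': '0.6.0+2.g1076c97.dirty',
     'full-revisionid': '1076c978a8d3cfc70f408fe5974aa6c092c949ac',
     'dirty': True, 'error': None, 'date': '2019-11-01T12:00:00-0400'}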
130 | The installer adds the following text to your `__init__.py` to place a basic | |
131 | version in `YOURPROJECT.__version__`: | |
132 | ||
133 | from ._version import get_versions | |
134 | __version__ = get_versions()['version'] | |
135 | del get_versions | |
136 | ||
137 | ## Styles | |
138 | ||
139 | The setup.cfg `style=` configuration controls how the VCS information is | |
140 | rendered into a version string. | |
141 | ||
142 | The default style, "pep440", produces a PEP440-compliant string, equal to the | |
143 | un-prefixed tag name for actual releases, and containing an additional "local | |
144 | version" section with more detail for in-between builds. For Git, this is | |
145 | TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags | |
146 | --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the | |
147 | tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and | |
148 | that this commit is two revisions ("+2") beyond the "0.11" tag. For released | |
149 | software (exactly equal to a known tag), the identifier will only contain the | |
150 | stripped tag, e.g. "0.11". | |
151 | ||
152 | Other styles are available. See [details.md](details.md) in the Versioneer | |
153 | source tree for descriptions. | |
154 | ||
155 | ## Debugging | |
156 | ||
157 | Versioneer tries to avoid fatal errors: if something goes wrong, it will tend | |
158 | to return a version of "0+unknown". To investigate the problem, run `setup.py | |
159 | version`, which will run the version-lookup code in a verbose mode, and will | |
160 | display the full contents of `get_versions()` (including the `error` string, | |
161 | which may help identify what went wrong). | |
162 | ||
163 | ## Known Limitations | |
164 | ||
165 | Some situations are known to cause problems for Versioneer. This section | |
166 | details the most significant ones. More can be found on the GitHub | |
167 | [issues page](https://github.com/warner/python-versioneer/issues). | |
168 | ||
169 | ### Subprojects | |
170 | ||
171 | Versioneer has limited support for source trees in which `setup.py` is not in | |
172 | the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are | |
173 | two common reasons why `setup.py` might not be in the root: | |
174 | ||
175 | * Source trees which contain multiple subprojects, such as | |
176 | [Buildbot](https://github.com/buildbot/buildbot), which contains both | |
177 | "master" and "slave" subprojects, each with their own `setup.py`, | |
178 | `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI | |
179 | distributions (and upload multiple independently-installable tarballs). | |
180 | * Source trees whose main purpose is to contain a C library, but which also | |
181 | provide bindings to Python (and perhaps other languages) in subdirectories. | |
182 | ||
183 | Versioneer will look for `.git` in parent directories, and most operations | |
184 | should get the right version string. However `pip` and `setuptools` have bugs | |
185 | and implementation details which frequently cause `pip install .` from a | |
186 | subproject directory to fail to find a correct version string (so it usually | |
187 | defaults to `0+unknown`). | |
188 | ||
189 | `pip install --editable .` should work correctly. `setup.py install` might | |
190 | work too. | |
191 | ||
192 | Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in | |
193 | some later version. | |
194 | ||
195 | [Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking | |
196 | this issue. The discussion in | |
197 | [PR #61](https://github.com/warner/python-versioneer/pull/61) describes the | |
198 | issue from the Versioneer side in more detail. | |
199 | [pip PR#3176](https://github.com/pypa/pip/pull/3176) and | |
200 | [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve | |
201 | pip to let Versioneer work correctly. | |
202 | ||
203 | Versioneer-0.16 and earlier only looked for a `.git` directory next to the | |
204 | `setup.cfg`, so subprojects were completely unsupported with those releases. | |
205 | ||
206 | ### Editable installs with setuptools <= 18.5 | |
207 | ||
208 | `setup.py develop` and `pip install --editable .` allow you to install a | |
209 | project into a virtualenv once, then continue editing the source code (and | |
210 | test) without re-installing after every change. | |
211 | ||
212 | "Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a | |
213 | convenient way to specify executable scripts that should be installed along | |
214 | with the python package. | |
215 | ||
216 | These both work as expected when using modern setuptools. When using | |
217 | setuptools-18.5 or earlier, however, certain operations will cause | |
218 | `pkg_resources.DistributionNotFound` errors when running the entrypoint | |
219 | script, which must be resolved by re-installing the package. This happens | |
220 | when the installation is done with one version, then the egg_info data is | |
221 | regenerated while a different version is checked out. Many setup.py commands | |
222 | cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into | |
223 | a different virtualenv), so this can be surprising. | |
224 | ||
225 | [Bug #83](https://github.com/warner/python-versioneer/issues/83) describes | |
226 | this one, but upgrading to a newer version of setuptools should probably | |
227 | resolve it. | |
228 | ||
229 | ### Unicode version strings | |
230 | ||
231 | While Versioneer works (and is continually tested) with both Python 2 and | |
232 | Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. | |
233 | Newer releases probably generate unicode version strings on py2. It's not | |
234 | clear that this is wrong, but it may be surprising for applications which then | |
235 | write these strings to a network connection or include them in bytes-oriented | |
236 | APIs like cryptographic checksums. | |
237 | ||
238 | [Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates | |
239 | this question. | |
240 | ||
241 | ||
242 | ## Updating Versioneer | |
243 | ||
244 | To upgrade your project to a new release of Versioneer, do the following: | |
245 | ||
246 | * install the new Versioneer (`pip install -U versioneer` or equivalent) | |
247 | * edit `setup.cfg`, if necessary, to include any new configuration settings | |
248 | indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. | |
249 | * re-run `versioneer install` in your source tree, to replace | |
250 | `SRC/_version.py` | |
251 | * commit any changed files | |
252 | ||
253 | ## Future Directions | |
254 | ||
255 | This tool is designed to be easily extended to other version-control | |
256 | systems: all VCS-specific components are in separate directories like | |
257 | src/git/ . The top-level `versioneer.py` script is assembled from these | |
258 | components by running make-versioneer.py . In the future, make-versioneer.py | |
259 | will take a VCS name as an argument, and will construct a version of | |
260 | `versioneer.py` that is specific to the given VCS. It might also take the | |
261 | configuration arguments that are currently provided manually during | |
262 | installation by editing setup.py . Alternatively, it might go the other | |
263 | direction and include code from all supported VCS systems, reducing the | |
264 | number of intermediate scripts. | |
265 | ||
266 | ||
267 | ## License | |
268 | ||
269 | To make Versioneer easier to embed, all its code is dedicated to the public | |
270 | domain. The `_version.py` that it creates is also in the public domain. | |
271 | Specifically, both are released under the Creative Commons "Public Domain | |
272 | Dedication" license (CC0-1.0), as described in | |
273 | https://creativecommons.org/publicdomain/zero/1.0/ . | |
274 | ||
275 | """ | |
276 | ||
277 | from __future__ import print_function | |
278 | ||
279 | try: | |
280 | import configparser | |
281 | except ImportError: | |
282 | import ConfigParser as configparser | |
283 | import errno | |
284 | import json | |
285 | import os | |
286 | import re | |
287 | import subprocess | |
288 | import sys | |
289 | ||
290 | ||
291 | class VersioneerConfig: | |
292 | """Container for Versioneer configuration parameters.""" | |
293 | ||
294 | ||
295 | def get_root(): | |
296 | """Get the project root directory. | |
297 | ||
298 | We require that all commands are run from the project root, i.e. the | |
299 | directory that contains setup.py, setup.cfg, and versioneer.py . | |
300 | """ | |
301 | root = os.path.realpath(os.path.abspath(os.getcwd())) | |
302 | setup_py = os.path.join(root, "setup.py") | |
303 | versioneer_py = os.path.join(root, "versioneer.py") | |
304 | if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): | |
305 | # allow 'python path/to/setup.py COMMAND' | |
306 | root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) | |
307 | setup_py = os.path.join(root, "setup.py") | |
308 | versioneer_py = os.path.join(root, "versioneer.py") | |
309 | if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): | |
310 | err = ( | |
311 | "Versioneer was unable to run the project root directory. " | |
312 | "Versioneer requires setup.py to be executed from " | |
313 | "its immediate directory (like 'python setup.py COMMAND'), " | |
314 | "or in a way that lets it use sys.argv[0] to find the root " | |
315 | "(like 'python path/to/setup.py COMMAND')." | |
316 | ) | |
317 | raise VersioneerBadRootError(err) | |
318 | try: | |
319 | # Certain runtime workflows (setup.py install/develop in a setuptools | |
320 | # tree) execute all dependencies in a single python process, so | |
321 | # "versioneer" may be imported multiple times, and python's shared | |
322 | # module-import table will cache the first one. So we can't use | |
323 | # os.path.dirname(__file__), as that will find whichever | |
324 | # versioneer.py was first imported, even in later projects. | |
325 | me = os.path.realpath(os.path.abspath(__file__)) | |
326 | me_dir = os.path.normcase(os.path.splitext(me)[0]) | |
327 | vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) | |
328 | if me_dir != vsr_dir: | |
329 | print( | |
330 | "Warning: build in %s is using versioneer.py from %s" | |
331 | % (os.path.dirname(me), versioneer_py) | |
332 | ) | |
333 | except NameError: | |
334 | pass | |
335 | return root | |
336 | ||
337 | ||
338 | def get_config_from_root(root): | |
339 | """Read the project setup.cfg file to determine Versioneer config.""" | |
340 | # This might raise EnvironmentError (if setup.cfg is missing), or | |
341 | # configparser.NoSectionError (if it lacks a [versioneer] section), or | |
342 | # configparser.NoOptionError (if it lacks "VCS="). See the docstring at | |
343 | # the top of versioneer.py for instructions on writing your setup.cfg . | |
344 | setup_cfg = os.path.join(root, "setup.cfg") | |
345 | parser = configparser.SafeConfigParser() | |
346 | with open(setup_cfg, "r") as f: | |
347 | parser.readfp(f) | |
348 | VCS = parser.get("versioneer", "VCS") # mandatory | |
349 | ||
350 | def get(parser, name): | |
351 | if parser.has_option("versioneer", name): | |
352 | return parser.get("versioneer", name) | |
353 | return None | |
354 | ||
355 | cfg = VersioneerConfig() | |
356 | cfg.VCS = VCS | |
357 | cfg.style = get(parser, "style") or "" | |
358 | cfg.versionfile_source = get(parser, "versionfile_source") | |
359 | cfg.versionfile_build = get(parser, "versionfile_build") | |
360 | cfg.tag_prefix = get(parser, "tag_prefix") | |
361 | if cfg.tag_prefix in ("''", '""'): | |
362 | cfg.tag_prefix = "" | |
363 | cfg.parentdir_prefix = get(parser, "parentdir_prefix") | |
364 | cfg.verbose = get(parser, "verbose") | |
365 | return cfg | |
366 | ||
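# For reference, the [versioneer] section consumed here matches the shape of
# the one this package ships in its setup.cfg, i.e.:
#
#     [versioneer]
#     VCS = git
#     style = pep440
#     versionfile_source = fsspec/_version.py
#     versionfile_build = fsspec/_version.py
#     tag_prefix = ""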
367 | ||
368 | class NotThisMethod(Exception): | |
369 | """Exception raised if a method is not valid for the current scenario.""" | |
370 | ||
371 | ||
372 | # these dictionaries contain VCS-specific tools | |
373 | LONG_VERSION_PY = {} | |
374 | HANDLERS = {} | |
375 | ||
376 | ||
377 | def register_vcs_handler(vcs, method): # decorator | |
378 | """Decorator to mark a method as the handler for a particular VCS.""" | |
379 | ||
380 | def decorate(f): | |
381 | """Store f in HANDLERS[vcs][method].""" | |
382 | if vcs not in HANDLERS: | |
383 | HANDLERS[vcs] = {} | |
384 | HANDLERS[vcs][method] = f | |
385 | return f | |
386 | ||
387 | return decorate | |
388 | ||
389 | ||
390 | def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): | |
391 | """Call the given command(s).""" | |
392 | assert isinstance(commands, list) | |
393 | p = None | |
394 | for c in commands: | |
395 | try: | |
396 | dispcmd = str([c] + args) | |
397 | # remember shell=False, so use git.cmd on windows, not just git | |
398 | p = subprocess.Popen( | |
399 | [c] + args, | |
400 | cwd=cwd, | |
401 | env=env, | |
402 | stdout=subprocess.PIPE, | |
403 | stderr=(subprocess.PIPE if hide_stderr else None), | |
404 | ) | |
405 | break | |
406 | except EnvironmentError: | |
407 | e = sys.exc_info()[1] | |
408 | if e.errno == errno.ENOENT: | |
409 | continue | |
410 | if verbose: | |
411 | print("unable to run %s" % dispcmd) | |
412 | print(e) | |
413 | return None, None | |
414 | else: | |
415 | if verbose: | |
416 | print("unable to find command, tried %s" % (commands,)) | |
417 | return None, None | |
418 | stdout = p.communicate()[0].strip() | |
419 | if sys.version_info[0] >= 3: | |
420 | stdout = stdout.decode() | |
421 | if p.returncode != 0: | |
422 | if verbose: | |
423 | print("unable to run %s (error)" % dispcmd) | |
424 | print("stdout was %s" % stdout) | |
425 | return None, p.returncode | |
426 | return stdout, p.returncode | |
427 | ||
428 | ||
429 | LONG_VERSION_PY[ | |
430 | "git" | |
431 | ] = ''' | |
432 | # This file helps to compute a version number in source trees obtained from | |
433 | # git-archive tarball (such as those provided by githubs download-from-tag | |
434 | # feature). Distribution tarballs (built by setup.py sdist) and build | |
435 | # directories (produced by setup.py build) will contain a much shorter file | |
436 | # that just contains the computed version number. | |
437 | ||
438 | # This file is released into the public domain. Generated by | |
439 | # versioneer-0.18 (https://github.com/warner/python-versioneer) | |
440 | ||
441 | """Git implementation of _version.py.""" | |
442 | ||
443 | import errno | |
444 | import os | |
445 | import re | |
446 | import subprocess | |
447 | import sys | |
448 | ||
449 | ||
450 | def get_keywords(): | |
451 | """Get the keywords needed to look up the version information.""" | |
452 | # these strings will be replaced by git during git-archive. | |
453 | # setup.py/versioneer.py will grep for the variable names, so they must | |
454 | # each be defined on a line of their own. _version.py will just call | |
455 | # get_keywords(). | |
456 | git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" | |
457 | git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" | |
458 | git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" | |
459 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} | |
460 | return keywords | |
461 | ||
462 | ||
463 | class VersioneerConfig: | |
464 | """Container for Versioneer configuration parameters.""" | |
465 | ||
466 | ||
467 | def get_config(): | |
468 | """Create, populate and return the VersioneerConfig() object.""" | |
469 | # these strings are filled in when 'setup.py versioneer' creates | |
470 | # _version.py | |
471 | cfg = VersioneerConfig() | |
472 | cfg.VCS = "git" | |
473 | cfg.style = "%(STYLE)s" | |
474 | cfg.tag_prefix = "%(TAG_PREFIX)s" | |
475 | cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" | |
476 | cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" | |
477 | cfg.verbose = False | |
478 | return cfg | |
479 | ||
480 | ||
481 | class NotThisMethod(Exception): | |
482 | """Exception raised if a method is not valid for the current scenario.""" | |
483 | ||
484 | ||
485 | LONG_VERSION_PY = {} | |
486 | HANDLERS = {} | |
487 | ||
488 | ||
489 | def register_vcs_handler(vcs, method): # decorator | |
490 | """Decorator to mark a method as the handler for a particular VCS.""" | |
491 | def decorate(f): | |
492 | """Store f in HANDLERS[vcs][method].""" | |
493 | if vcs not in HANDLERS: | |
494 | HANDLERS[vcs] = {} | |
495 | HANDLERS[vcs][method] = f | |
496 | return f | |
497 | return decorate | |
498 | ||
499 | ||
500 | def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, | |
501 | env=None): | |
502 | """Call the given command(s).""" | |
503 | assert isinstance(commands, list) | |
504 | p = None | |
505 | for c in commands: | |
506 | try: | |
507 | dispcmd = str([c] + args) | |
508 | # remember shell=False, so use git.cmd on windows, not just git | |
509 | p = subprocess.Popen([c] + args, cwd=cwd, env=env, | |
510 | stdout=subprocess.PIPE, | |
511 | stderr=(subprocess.PIPE if hide_stderr | |
512 | else None)) | |
513 | break | |
514 | except EnvironmentError: | |
515 | e = sys.exc_info()[1] | |
516 | if e.errno == errno.ENOENT: | |
517 | continue | |
518 | if verbose: | |
519 | print("unable to run %%s" %% dispcmd) | |
520 | print(e) | |
521 | return None, None | |
522 | else: | |
523 | if verbose: | |
524 | print("unable to find command, tried %%s" %% (commands,)) | |
525 | return None, None | |
526 | stdout = p.communicate()[0].strip() | |
527 | if sys.version_info[0] >= 3: | |
528 | stdout = stdout.decode() | |
529 | if p.returncode != 0: | |
530 | if verbose: | |
531 | print("unable to run %%s (error)" %% dispcmd) | |
532 | print("stdout was %%s" %% stdout) | |
533 | return None, p.returncode | |
534 | return stdout, p.returncode | |
535 | ||
536 | ||
537 | def versions_from_parentdir(parentdir_prefix, root, verbose): | |
538 | """Try to determine the version from the parent directory name. | |
539 | ||
540 | Source tarballs conventionally unpack into a directory that includes both | |
541 | the project name and a version string. We will also support searching up | |
542 | two directory levels for an appropriately named parent directory | |
543 | """ | |
544 | rootdirs = [] | |
545 | ||
546 | for i in range(3): | |
547 | dirname = os.path.basename(root) | |
548 | if dirname.startswith(parentdir_prefix): | |
549 | return {"version": dirname[len(parentdir_prefix):], | |
550 | "full-revisionid": None, | |
551 | "dirty": False, "error": None, "date": None} | |
552 | else: | |
553 | rootdirs.append(root) | |
554 | root = os.path.dirname(root) # up a level | |
555 | ||
556 | if verbose: | |
557 | print("Tried directories %%s but none started with prefix %%s" %% | |
558 | (str(rootdirs), parentdir_prefix)) | |
559 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") | |
560 | ||
561 | ||
562 | @register_vcs_handler("git", "get_keywords") | |
563 | def git_get_keywords(versionfile_abs): | |
564 | """Extract version information from the given file.""" | |
565 | # the code embedded in _version.py can just fetch the value of these | |
566 | # keywords. When used from setup.py, we don't want to import _version.py, | |
567 | # so we do it with a regexp instead. This function is not used from | |
568 | # _version.py. | |
569 | keywords = {} | |
570 | try: | |
571 | f = open(versionfile_abs, "r") | |
572 | for line in f.readlines(): | |
573 | if line.strip().startswith("git_refnames ="): | |
574 | mo = re.search(r'=\s*"(.*)"', line) | |
575 | if mo: | |
576 | keywords["refnames"] = mo.group(1) | |
577 | if line.strip().startswith("git_full ="): | |
578 | mo = re.search(r'=\s*"(.*)"', line) | |
579 | if mo: | |
580 | keywords["full"] = mo.group(1) | |
581 | if line.strip().startswith("git_date ="): | |
582 | mo = re.search(r'=\s*"(.*)"', line) | |
583 | if mo: | |
584 | keywords["date"] = mo.group(1) | |
585 | f.close() | |
586 | except EnvironmentError: | |
587 | pass | |
588 | return keywords | |
589 | ||
590 | ||
591 | @register_vcs_handler("git", "keywords") | |
592 | def git_versions_from_keywords(keywords, tag_prefix, verbose): | |
593 | """Get version information from git keywords.""" | |
594 | if not keywords: | |
595 | raise NotThisMethod("no keywords at all, weird") | |
596 | date = keywords.get("date") | |
597 | if date is not None: | |
598 | # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant | |
599 | # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 | |
600 | # -like" string, which we must then edit to make compliant), because | |
601 | # it's been around since git-1.5.3, and it's too difficult to | |
602 | # discover which version we're using, or to work around using an | |
603 | # older one. | |
604 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) | |
605 | refnames = keywords["refnames"].strip() | |
606 | if refnames.startswith("$Format"): | |
607 | if verbose: | |
608 | print("keywords are unexpanded, not using") | |
609 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") | |
610 | refs = set([r.strip() for r in refnames.strip("()").split(",")]) | |
611 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of | |
612 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. | |
613 | TAG = "tag: " | |
614 | tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) | |
615 | if not tags: | |
616 | # Either we're using git < 1.8.3, or there really are no tags. We use | |
617 | # a heuristic: assume all version tags have a digit. The old git %%d | |
618 | # expansion behaves like git log --decorate=short and strips out the | |
619 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish | |
620 | # between branches and tags. By ignoring refnames without digits, we | |
621 | # filter out many common branch names like "release" and | |
622 | # "stabilization", as well as "HEAD" and "master". | |
623 | tags = set([r for r in refs if re.search(r'\d', r)]) | |
624 | if verbose: | |
625 | print("discarding '%%s', no digits" %% ",".join(refs - tags)) | |
626 | if verbose: | |
627 | print("likely tags: %%s" %% ",".join(sorted(tags))) | |
628 | for ref in sorted(tags): | |
629 | # sorting will prefer e.g. "2.0" over "2.0rc1" | |
630 | if ref.startswith(tag_prefix): | |
631 | r = ref[len(tag_prefix):] | |
632 | if verbose: | |
633 | print("picking %%s" %% r) | |
634 | return {"version": r, | |
635 | "full-revisionid": keywords["full"].strip(), | |
636 | "dirty": False, "error": None, | |
637 | "date": date} | |
638 | # no suitable tags, so version is "0+unknown", but full hex is still there | |
639 | if verbose: | |
640 | print("no suitable tags, using unknown + full revision id") | |
641 | return {"version": "0+unknown", | |
642 | "full-revisionid": keywords["full"].strip(), | |
643 | "dirty": False, "error": "no suitable tags", "date": None} | |
644 | ||
645 | ||
646 | @register_vcs_handler("git", "pieces_from_vcs") | |
647 | def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): | |
648 | """Get version from 'git describe' in the root of the source tree. | |
649 | ||
650 | This only gets called if the git-archive 'subst' keywords were *not* | |
651 | expanded, and _version.py hasn't already been rewritten with a short | |
652 | version string, meaning we're inside a checked out source tree. | |
653 | """ | |
654 | GITS = ["git"] | |
655 | if sys.platform == "win32": | |
656 | GITS = ["git.cmd", "git.exe"] | |
657 | ||
658 | out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, | |
659 | hide_stderr=True) | |
660 | if rc != 0: | |
661 | if verbose: | |
662 | print("Directory %%s not under git control" %% root) | |
663 | raise NotThisMethod("'git rev-parse --git-dir' returned error") | |
664 | ||
665 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] | |
666 | # if there isn't one, this yields HEX[-dirty] (no NUM) | |
667 | describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", | |
668 | "--always", "--long", | |
669 | "--match", "%%s*" %% tag_prefix], | |
670 | cwd=root) | |
671 | # --long was added in git-1.5.5 | |
672 | if describe_out is None: | |
673 | raise NotThisMethod("'git describe' failed") | |
674 | describe_out = describe_out.strip() | |
675 | full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) | |
676 | if full_out is None: | |
677 | raise NotThisMethod("'git rev-parse' failed") | |
678 | full_out = full_out.strip() | |
679 | ||
680 | pieces = {} | |
681 | pieces["long"] = full_out | |
682 | pieces["short"] = full_out[:7] # maybe improved later | |
683 | pieces["error"] = None | |
684 | ||
685 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] | |
686 | # TAG might have hyphens. | |
687 | git_describe = describe_out | |
688 | ||
689 | # look for -dirty suffix | |
690 | dirty = git_describe.endswith("-dirty") | |
691 | pieces["dirty"] = dirty | |
692 | if dirty: | |
693 | git_describe = git_describe[:git_describe.rindex("-dirty")] | |
694 | ||
695 | # now we have TAG-NUM-gHEX or HEX | |
696 | ||
697 | if "-" in git_describe: | |
698 | # TAG-NUM-gHEX | |
699 | mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) | |
700 | if not mo: | |
701 | # unparseable. Maybe git-describe is misbehaving? | |
702 | pieces["error"] = ("unable to parse git-describe output: '%%s'" | |
703 | %% describe_out) | |
704 | return pieces | |
705 | ||
706 | # tag | |
707 | full_tag = mo.group(1) | |
708 | if not full_tag.startswith(tag_prefix): | |
709 | if verbose: | |
710 | fmt = "tag '%%s' doesn't start with prefix '%%s'" | |
711 | print(fmt %% (full_tag, tag_prefix)) | |
712 | pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" | |
713 | %% (full_tag, tag_prefix)) | |
714 | return pieces | |
715 | pieces["closest-tag"] = full_tag[len(tag_prefix):] | |
716 | ||
717 | # distance: number of commits since tag | |
718 | pieces["distance"] = int(mo.group(2)) | |
719 | ||
720 | # commit: short hex revision ID | |
721 | pieces["short"] = mo.group(3) | |
722 | ||
723 | else: | |
724 | # HEX: no tags | |
725 | pieces["closest-tag"] = None | |
726 | count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], | |
727 | cwd=root) | |
728 | pieces["distance"] = int(count_out) # total number of commits | |
729 | ||
730 | # commit date: see ISO-8601 comment in git_versions_from_keywords() | |
731 | date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], | |
732 | cwd=root)[0].strip() | |
733 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) | |
734 | ||
735 | return pieces | |
736 | ||
737 | ||
738 | def plus_or_dot(pieces): | |
739 | """Return a + if we don't already have one, else return a .""" | |
740 | if "+" in pieces.get("closest-tag", ""): | |
741 | return "." | |
742 | return "+" | |
743 | ||
744 | ||
745 | def render_pep440(pieces): | |
746 | """Build up version string, with post-release "local version identifier". | |
747 | ||
748 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you | |
749 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty | |
750 | ||
751 | Exceptions: | |
752 | 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] | |
753 | """ | |
754 | if pieces["closest-tag"]: | |
755 | rendered = pieces["closest-tag"] | |
756 | if pieces["distance"] or pieces["dirty"]: | |
757 | rendered += plus_or_dot(pieces) | |
758 | rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) | |
759 | if pieces["dirty"]: | |
760 | rendered += ".dirty" | |
761 | else: | |
762 | # exception #1 | |
763 | rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], | |
764 | pieces["short"]) | |
765 | if pieces["dirty"]: | |
766 | rendered += ".dirty" | |
767 | return rendered | |
768 | ||
769 | ||
770 | def render_pep440_pre(pieces): | |
771 | """TAG[.post.devDISTANCE] -- No -dirty. | |
772 | ||
773 | Exceptions: | |
774 | 1: no tags. 0.post.devDISTANCE | |
775 | """ | |
776 | if pieces["closest-tag"]: | |
777 | rendered = pieces["closest-tag"] | |
778 | if pieces["distance"]: | |
779 | rendered += ".post.dev%%d" %% pieces["distance"] | |
780 | else: | |
781 | # exception #1 | |
782 | rendered = "0.post.dev%%d" %% pieces["distance"] | |
783 | return rendered | |
784 | ||
785 | ||
786 | def render_pep440_post(pieces): | |
787 | """TAG[.postDISTANCE[.dev0]+gHEX] . | |
788 | ||
789 | The ".dev0" means dirty. Note that .dev0 sorts backwards | |
790 | (a dirty tree will appear "older" than the corresponding clean one), | |
791 | but you shouldn't be releasing software with -dirty anyways. | |
792 | ||
793 | Exceptions: | |
794 | 1: no tags. 0.postDISTANCE[.dev0] | |
795 | """ | |
796 | if pieces["closest-tag"]: | |
797 | rendered = pieces["closest-tag"] | |
798 | if pieces["distance"] or pieces["dirty"]: | |
799 | rendered += ".post%%d" %% pieces["distance"] | |
800 | if pieces["dirty"]: | |
801 | rendered += ".dev0" | |
802 | rendered += plus_or_dot(pieces) | |
803 | rendered += "g%%s" %% pieces["short"] | |
804 | else: | |
805 | # exception #1 | |
806 | rendered = "0.post%%d" %% pieces["distance"] | |
807 | if pieces["dirty"]: | |
808 | rendered += ".dev0" | |
809 | rendered += "+g%%s" %% pieces["short"] | |
810 | return rendered | |
811 | ||
812 | ||
813 | def render_pep440_old(pieces): | |
814 | """TAG[.postDISTANCE[.dev0]] . | |
815 | ||
816 | The ".dev0" means dirty. | |
817 | ||
818 | Exceptions: | |
819 | 1: no tags. 0.postDISTANCE[.dev0] | |
820 | """ | |
821 | if pieces["closest-tag"]: | |
822 | rendered = pieces["closest-tag"] | |
823 | if pieces["distance"] or pieces["dirty"]: | |
824 | rendered += ".post%%d" %% pieces["distance"] | |
825 | if pieces["dirty"]: | |
826 | rendered += ".dev0" | |
827 | else: | |
828 | # exception #1 | |
829 | rendered = "0.post%%d" %% pieces["distance"] | |
830 | if pieces["dirty"]: | |
831 | rendered += ".dev0" | |
832 | return rendered | |
833 | ||
834 | ||
835 | def render_git_describe(pieces): | |
836 | """TAG[-DISTANCE-gHEX][-dirty]. | |
837 | ||
838 | Like 'git describe --tags --dirty --always'. | |
839 | ||
840 | Exceptions: | |
841 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) | |
842 | """ | |
843 | if pieces["closest-tag"]: | |
844 | rendered = pieces["closest-tag"] | |
845 | if pieces["distance"]: | |
846 | rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) | |
847 | else: | |
848 | # exception #1 | |
849 | rendered = pieces["short"] | |
850 | if pieces["dirty"]: | |
851 | rendered += "-dirty" | |
852 | return rendered | |
853 | ||
854 | ||
855 | def render_git_describe_long(pieces): | |
856 | """TAG-DISTANCE-gHEX[-dirty]. | |
857 | ||
858 | Like 'git describe --tags --dirty --always --long'. | |
859 | The distance/hash is unconditional. | |
860 | ||
861 | Exceptions: | |
862 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) | |
863 | """ | |
864 | if pieces["closest-tag"]: | |
865 | rendered = pieces["closest-tag"] | |
866 | rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) | |
867 | else: | |
868 | # exception #1 | |
869 | rendered = pieces["short"] | |
870 | if pieces["dirty"]: | |
871 | rendered += "-dirty" | |
872 | return rendered | |
873 | ||
874 | ||
875 | def render(pieces, style): | |
876 | """Render the given version pieces into the requested style.""" | |
877 | if pieces["error"]: | |
878 | return {"version": "unknown", | |
879 | "full-revisionid": pieces.get("long"), | |
880 | "dirty": None, | |
881 | "error": pieces["error"], | |
882 | "date": None} | |
883 | ||
884 | if not style or style == "default": | |
885 | style = "pep440" # the default | |
886 | ||
887 | if style == "pep440": | |
888 | rendered = render_pep440(pieces) | |
889 | elif style == "pep440-pre": | |
890 | rendered = render_pep440_pre(pieces) | |
891 | elif style == "pep440-post": | |
892 | rendered = render_pep440_post(pieces) | |
893 | elif style == "pep440-old": | |
894 | rendered = render_pep440_old(pieces) | |
895 | elif style == "git-describe": | |
896 | rendered = render_git_describe(pieces) | |
897 | elif style == "git-describe-long": | |
898 | rendered = render_git_describe_long(pieces) | |
899 | else: | |
900 | raise ValueError("unknown style '%%s'" %% style) | |
901 | ||
902 | return {"version": rendered, "full-revisionid": pieces["long"], | |
903 | "dirty": pieces["dirty"], "error": None, | |
904 | "date": pieces.get("date")} | |
905 | ||
906 | ||
907 | def get_versions(): | |
908 | """Get version information or return default if unable to do so.""" | |
909 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have | |
910 | # __file__, we can work backwards from there to the root. Some | |
911 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which | |
912 | # case we can only use expanded keywords. | |
913 | ||
914 | cfg = get_config() | |
915 | verbose = cfg.verbose | |
916 | ||
917 | try: | |
918 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, | |
919 | verbose) | |
920 | except NotThisMethod: | |
921 | pass | |
922 | ||
923 | try: | |
924 | root = os.path.realpath(__file__) | |
925 | # versionfile_source is the relative path from the top of the source | |
926 | # tree (where the .git directory might live) to this file. Invert | |
927 | # this to find the root from __file__. | |
928 | for i in cfg.versionfile_source.split('/'): | |
929 | root = os.path.dirname(root) | |
930 | except NameError: | |
931 | return {"version": "0+unknown", "full-revisionid": None, | |
932 | "dirty": None, | |
933 | "error": "unable to find root of source tree", | |
934 | "date": None} | |
935 | ||
936 | try: | |
937 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) | |
938 | return render(pieces, cfg.style) | |
939 | except NotThisMethod: | |
940 | pass | |
941 | ||
942 | try: | |
943 | if cfg.parentdir_prefix: | |
944 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) | |
945 | except NotThisMethod: | |
946 | pass | |
947 | ||
948 | return {"version": "0+unknown", "full-revisionid": None, | |
949 | "dirty": None, | |
950 | "error": "unable to compute version", "date": None} | |
951 | ''' | |
952 | ||
953 | ||
954 | @register_vcs_handler("git", "get_keywords") | |
955 | def git_get_keywords(versionfile_abs): | |
956 | """Extract version information from the given file.""" | |
957 | # the code embedded in _version.py can just fetch the value of these | |
958 | # keywords. When used from setup.py, we don't want to import _version.py, | |
959 | # so we do it with a regexp instead. This function is not used from | |
960 | # _version.py. | |
961 | keywords = {} | |
962 | try: | |
963 | f = open(versionfile_abs, "r") | |
964 | for line in f.readlines(): | |
965 | if line.strip().startswith("git_refnames ="): | |
966 | mo = re.search(r'=\s*"(.*)"', line) | |
967 | if mo: | |
968 | keywords["refnames"] = mo.group(1) | |
969 | if line.strip().startswith("git_full ="): | |
970 | mo = re.search(r'=\s*"(.*)"', line) | |
971 | if mo: | |
972 | keywords["full"] = mo.group(1) | |
973 | if line.strip().startswith("git_date ="): | |
974 | mo = re.search(r'=\s*"(.*)"', line) | |
975 | if mo: | |
976 | keywords["date"] = mo.group(1) | |
977 | f.close() | |
978 | except EnvironmentError: | |
979 | pass | |
980 | return keywords | |
981 | ||
982 | ||
983 | @register_vcs_handler("git", "keywords") | |
984 | def git_versions_from_keywords(keywords, tag_prefix, verbose): | |
985 | """Get version information from git keywords.""" | |
986 | if not keywords: | |
987 | raise NotThisMethod("no keywords at all, weird") | |
988 | date = keywords.get("date") | |
989 | if date is not None: | |
990 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant | |
991 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 | |
992 | # -like" string, which we must then edit to make compliant), because | |
993 | # it's been around since git-1.5.3, and it's too difficult to | |
994 | # discover which version we're using, or to work around using an | |
995 | # older one. | |
996 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) | |
997 | refnames = keywords["refnames"].strip() | |
998 | if refnames.startswith("$Format"): | |
999 | if verbose: | |
1000 | print("keywords are unexpanded, not using") | |
1001 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") | |
1002 | refs = set([r.strip() for r in refnames.strip("()").split(",")]) | |
1003 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of | |
1004 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. | |
1005 | TAG = "tag: " | |
1006 | tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) | |
1007 | if not tags: | |
1008 | # Either we're using git < 1.8.3, or there really are no tags. We use | |
1009 | # a heuristic: assume all version tags have a digit. The old git %d | |
1010 | # expansion behaves like git log --decorate=short and strips out the | |
1011 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish | |
1012 | # between branches and tags. By ignoring refnames without digits, we | |
1013 | # filter out many common branch names like "release" and | |
1014 | # "stabilization", as well as "HEAD" and "master". | |
1015 | tags = set([r for r in refs if re.search(r"\d", r)]) | |
1016 | if verbose: | |
1017 | print("discarding '%s', no digits" % ",".join(refs - tags)) | |
1018 | if verbose: | |
1019 | print("likely tags: %s" % ",".join(sorted(tags))) | |
1020 | for ref in sorted(tags): | |
1021 | # sorting will prefer e.g. "2.0" over "2.0rc1" | |
1022 | if ref.startswith(tag_prefix): | |
1023 | r = ref[len(tag_prefix) :] | |
1024 | if verbose: | |
1025 | print("picking %s" % r) | |
1026 | return { | |
1027 | "version": r, | |
1028 | "full-revisionid": keywords["full"].strip(), | |
1029 | "dirty": False, | |
1030 | "error": None, | |
1031 | "date": date, | |
1032 | } | |
1033 | # no suitable tags, so version is "0+unknown", but full hex is still there | |
1034 | if verbose: | |
1035 | print("no suitable tags, using unknown + full revision id") | |
1036 | return { | |
1037 | "version": "0+unknown", | |
1038 | "full-revisionid": keywords["full"].strip(), | |
1039 | "dirty": False, | |
1040 | "error": "no suitable tags", | |
1041 | "date": None, | |
1042 | } | |
1043 | ||
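# Illustrative input for git_versions_from_keywords above (hypothetical
# values): in a git-archive tarball, the export-subst keyword "$Format:%d$"
# expands to something like " (HEAD -> master, tag: 0.6.0)", from which the
# candidate tag "0.6.0" is extracted once the "tag: " prefix is stripped.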
1044 | ||
1045 | @register_vcs_handler("git", "pieces_from_vcs") | |
1046 | def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): | |
1047 | """Get version from 'git describe' in the root of the source tree. | |
1048 | ||
1049 | This only gets called if the git-archive 'subst' keywords were *not* | |
1050 | expanded, and _version.py hasn't already been rewritten with a short | |
1051 | version string, meaning we're inside a checked out source tree. | |
1052 | """ | |
1053 | GITS = ["git"] | |
1054 | if sys.platform == "win32": | |
1055 | GITS = ["git.cmd", "git.exe"] | |
1056 | ||
1057 | out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) | |
1058 | if rc != 0: | |
1059 | if verbose: | |
1060 | print("Directory %s not under git control" % root) | |
1061 | raise NotThisMethod("'git rev-parse --git-dir' returned error") | |
1062 | ||
1063 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] | |
1064 | # if there isn't one, this yields HEX[-dirty] (no NUM) | |
1065 | describe_out, rc = run_command( | |
1066 | GITS, | |
1067 | [ | |
1068 | "describe", | |
1069 | "--tags", | |
1070 | "--dirty", | |
1071 | "--always", | |
1072 | "--long", | |
1073 | "--match", | |
1074 | "%s*" % tag_prefix, | |
1075 | ], | |
1076 | cwd=root, | |
1077 | ) | |
1078 | # --long was added in git-1.5.5 | |
1079 | if describe_out is None: | |
1080 | raise NotThisMethod("'git describe' failed") | |
1081 | describe_out = describe_out.strip() | |
1082 | full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) | |
1083 | if full_out is None: | |
1084 | raise NotThisMethod("'git rev-parse' failed") | |
1085 | full_out = full_out.strip() | |
1086 | ||
1087 | pieces = {} | |
1088 | pieces["long"] = full_out | |
1089 | pieces["short"] = full_out[:7] # maybe improved later | |
1090 | pieces["error"] = None | |
1091 | ||
1092 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] | |
1093 | # TAG might have hyphens. | |
1094 | git_describe = describe_out | |
1095 | ||
1096 | # look for -dirty suffix | |
1097 | dirty = git_describe.endswith("-dirty") | |
1098 | pieces["dirty"] = dirty | |
1099 | if dirty: | |
1100 | git_describe = git_describe[: git_describe.rindex("-dirty")] | |
1101 | ||
1102 | # now we have TAG-NUM-gHEX or HEX | |
1103 | ||
1104 | if "-" in git_describe: | |
1105 | # TAG-NUM-gHEX | |
1106 | mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) | |
1107 | if not mo: | |
1108 | # unparseable. Maybe git-describe is misbehaving? | |
1109 | pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out | |
1110 | return pieces | |
1111 | ||
1112 | # tag | |
1113 | full_tag = mo.group(1) | |
1114 | if not full_tag.startswith(tag_prefix): | |
1115 | if verbose: | |
1116 | fmt = "tag '%s' doesn't start with prefix '%s'" | |
1117 | print(fmt % (full_tag, tag_prefix)) | |
1118 | pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( | |
1119 | full_tag, | |
1120 | tag_prefix, | |
1121 | ) | |
1122 | return pieces | |
1123 | pieces["closest-tag"] = full_tag[len(tag_prefix) :] | |
1124 | ||
1125 | # distance: number of commits since tag | |
1126 | pieces["distance"] = int(mo.group(2)) | |
1127 | ||
1128 | # commit: short hex revision ID | |
1129 | pieces["short"] = mo.group(3) | |
1130 | ||
1131 | else: | |
1132 | # HEX: no tags | |
1133 | pieces["closest-tag"] = None | |
1134 | count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) | |
1135 | pieces["distance"] = int(count_out) # total number of commits | |
1136 | ||
1137 | # commit date: see ISO-8601 comment in git_versions_from_keywords() | |
1138 | date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ | |
1139 | 0 | |
1140 | ].strip() | |
1141 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) | |
1142 | ||
1143 | return pieces | |
1144 | ||
1145 | ||
1146 | def do_vcs_install(manifest_in, versionfile_source, ipy): | |
1147 | """Git-specific installation logic for Versioneer. | |
1148 | ||
1149 | For Git, this means creating/changing .gitattributes to mark _version.py | |
1150 | for export-subst keyword substitution. | |
1151 | """ | |
1152 | GITS = ["git"] | |
1153 | if sys.platform == "win32": | |
1154 | GITS = ["git.cmd", "git.exe"] | |
1155 | files = [manifest_in, versionfile_source] | |
1156 | if ipy: | |
1157 | files.append(ipy) | |
1158 | try: | |
1159 | me = __file__ | |
1160 | if me.endswith(".pyc") or me.endswith(".pyo"): | |
1161 | me = os.path.splitext(me)[0] + ".py" | |
1162 | versioneer_file = os.path.relpath(me) | |
1163 | except NameError: | |
1164 | versioneer_file = "versioneer.py" | |
1165 | files.append(versioneer_file) | |
1166 | present = False | |
1167 | try: | |
1168 | f = open(".gitattributes", "r") | |
1169 | for line in f.readlines(): | |
1170 | if line.strip().startswith(versionfile_source): | |
1171 | if "export-subst" in line.strip().split()[1:]: | |
1172 | present = True | |
1173 | f.close() | |
1174 | except EnvironmentError: | |
1175 | pass | |
1176 | if not present: | |
1177 | f = open(".gitattributes", "a+") | |
1178 | f.write("%s export-subst\n" % versionfile_source) | |
1179 | f.close() | |
1180 | files.append(".gitattributes") | |
1181 | run_command(GITS, ["add", "--"] + files) | |
1182 | ||
1183 | ||
1184 | def versions_from_parentdir(parentdir_prefix, root, verbose): | |
1185 | """Try to determine the version from the parent directory name. | |
1186 | ||
1187 | Source tarballs conventionally unpack into a directory that includes both | |
1188 | the project name and a version string. We will also support searching up | |
1189 | two directory levels for an appropriately named parent directory. | |
1190 | """ | |
1191 | rootdirs = [] | |
1192 | ||
1193 | for i in range(3): | |
1194 | dirname = os.path.basename(root) | |
1195 | if dirname.startswith(parentdir_prefix): | |
1196 | return { | |
1197 | "version": dirname[len(parentdir_prefix) :], | |
1198 | "full-revisionid": None, | |
1199 | "dirty": False, | |
1200 | "error": None, | |
1201 | "date": None, | |
1202 | } | |
1203 | else: | |
1204 | rootdirs.append(root) | |
1205 | root = os.path.dirname(root) # up a level | |
1206 | ||
1207 | if verbose: | |
1208 | print( | |
1209 | "Tried directories %s but none started with prefix %s" | |
1210 | % (str(rootdirs), parentdir_prefix) | |
1211 | ) | |
1212 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") | |
1213 | ||
1214 | ||
1215 | SHORT_VERSION_PY = """ | |
1216 | # This file was generated by 'versioneer.py' (0.18) from | |
1217 | # revision-control system data, or from the parent directory name of an | |
1218 | # unpacked source archive. Distribution tarballs contain a pre-generated copy | |
1219 | # of this file. | |
1220 | ||
1221 | import json | |
1222 | ||
1223 | version_json = ''' | |
1224 | %s | |
1225 | ''' # END VERSION_JSON | |
1226 | ||
1227 | ||
1228 | def get_versions(): | |
1229 | return json.loads(version_json) | |
1230 | """ | |
1231 | ||
1232 | ||
1233 | def versions_from_file(filename): | |
1234 | """Try to determine the version from _version.py if present.""" | |
1235 | try: | |
1236 | with open(filename) as f: | |
1237 | contents = f.read() | |
1238 | except EnvironmentError: | |
1239 | raise NotThisMethod("unable to read _version.py") | |
1240 | mo = re.search( | |
1241 | r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S | |
1242 | ) | |
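| # a checkout written with Windows (CRLF) line endings won't match \n; retry below | |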
1243 | if not mo: | |
1244 | mo = re.search( | |
1245 | r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S | |
1246 | ) | |
1247 | if not mo: | |
1248 | raise NotThisMethod("no version_json in _version.py") | |
1249 | return json.loads(mo.group(1)) | |
1250 | ||
1251 | ||
1252 | def write_to_version_file(filename, versions): | |
1253 | """Write the given version number to the given _version.py file.""" | |
1254 | os.unlink(filename) | |
1255 | contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) | |
1256 | with open(filename, "w") as f: | |
1257 | f.write(SHORT_VERSION_PY % contents) | |
1258 | ||
1259 | print("set %s to '%s'" % (filename, versions["version"])) | |
1260 | ||
1261 | ||
1262 | def plus_or_dot(pieces): | |
1263 | """Return a + if we don't already have one, else return a .""" | |
1264 | if "+" in pieces.get("closest-tag", ""): | |
1265 | return "." | |
1266 | return "+" | |
1267 | ||
1268 | ||
1269 | def render_pep440(pieces): | |
1270 | """Build up version string, with post-release "local version identifier". | |
1271 | ||
1272 | Our goal: TAG[+DISTANCE.gHEX[.dirty]]. Note that if you | |
1273 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty | |
1274 | ||
1275 | Exceptions: | |
1276 | 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] | |
1277 | """ | |
1278 | if pieces["closest-tag"]: | |
1279 | rendered = pieces["closest-tag"] | |
1280 | if pieces["distance"] or pieces["dirty"]: | |
1281 | rendered += plus_or_dot(pieces) | |
1282 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) | |
1283 | if pieces["dirty"]: | |
1284 | rendered += ".dirty" | |
1285 | else: | |
1286 | # exception #1 | |
1287 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) | |
1288 | if pieces["dirty"]: | |
1289 | rendered += ".dirty" | |
1290 | return rendered | |
1291 | ||
1292 | ||
1293 | def render_pep440_pre(pieces): | |
1294 | """TAG[.post.devDISTANCE] -- No -dirty. | |
1295 | ||
1296 | Exceptions: | |
1297 | 1: no tags. 0.post.devDISTANCE | |
1298 | """ | |
1299 | if pieces["closest-tag"]: | |
1300 | rendered = pieces["closest-tag"] | |
1301 | if pieces["distance"]: | |
1302 | rendered += ".post.dev%d" % pieces["distance"] | |
1303 | else: | |
1304 | # exception #1 | |
1305 | rendered = "0.post.dev%d" % pieces["distance"] | |
1306 | return rendered | |
1307 | ||
1308 | ||
1309 | def render_pep440_post(pieces): | |
1310 | """TAG[.postDISTANCE[.dev0]+gHEX] . | |
1311 | ||
1312 | The ".dev0" means dirty. Note that .dev0 sorts backwards | |
1313 | (a dirty tree will appear "older" than the corresponding clean one), | |
1314 | but you shouldn't be releasing software with -dirty anyway. | |
1315 | ||
1316 | Exceptions: | |
1317 | 1: no tags. 0.postDISTANCE[.dev0] | |
1318 | """ | |
1319 | if pieces["closest-tag"]: | |
1320 | rendered = pieces["closest-tag"] | |
1321 | if pieces["distance"] or pieces["dirty"]: | |
1322 | rendered += ".post%d" % pieces["distance"] | |
1323 | if pieces["dirty"]: | |
1324 | rendered += ".dev0" | |
1325 | rendered += plus_or_dot(pieces) | |
1326 | rendered += "g%s" % pieces["short"] | |
1327 | else: | |
1328 | # exception #1 | |
1329 | rendered = "0.post%d" % pieces["distance"] | |
1330 | if pieces["dirty"]: | |
1331 | rendered += ".dev0" | |
1332 | rendered += "+g%s" % pieces["short"] | |
1333 | return rendered | |
1334 | ||
1335 | ||
1336 | def render_pep440_old(pieces): | |
1337 | """TAG[.postDISTANCE[.dev0]] . | |
1338 | ||
1339 | The ".dev0" means dirty. | |
1340 | ||
1341 | Exceptions: | |
1342 | 1: no tags. 0.postDISTANCE[.dev0] | |
1343 | """ | |
1344 | if pieces["closest-tag"]: | |
1345 | rendered = pieces["closest-tag"] | |
1346 | if pieces["distance"] or pieces["dirty"]: | |
1347 | rendered += ".post%d" % pieces["distance"] | |
1348 | if pieces["dirty"]: | |
1349 | rendered += ".dev0" | |
1350 | else: | |
1351 | # exception #1 | |
1352 | rendered = "0.post%d" % pieces["distance"] | |
1353 | if pieces["dirty"]: | |
1354 | rendered += ".dev0" | |
1355 | return rendered | |
1356 | ||
1357 | ||
1358 | def render_git_describe(pieces): | |
1359 | """TAG[-DISTANCE-gHEX][-dirty]. | |
1360 | ||
1361 | Like 'git describe --tags --dirty --always'. | |
1362 | ||
1363 | Exceptions: | |
1364 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) | |
1365 | """ | |
1366 | if pieces["closest-tag"]: | |
1367 | rendered = pieces["closest-tag"] | |
1368 | if pieces["distance"]: | |
1369 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) | |
1370 | else: | |
1371 | # exception #1 | |
1372 | rendered = pieces["short"] | |
1373 | if pieces["dirty"]: | |
1374 | rendered += "-dirty" | |
1375 | return rendered | |
1376 | ||
1377 | ||
1378 | def render_git_describe_long(pieces): | |
1379 | """TAG-DISTANCE-gHEX[-dirty]. | |
1380 | ||
1381 | Like 'git describe --tags --dirty --always --long'. | |
1382 | The distance/hash is unconditional. | |
1383 | ||
1384 | Exceptions: | |
1385 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) | |
1386 | """ | |
1387 | if pieces["closest-tag"]: | |
1388 | rendered = pieces["closest-tag"] | |
1389 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) | |
1390 | else: | |
1391 | # exception #1 | |
1392 | rendered = pieces["short"] | |
1393 | if pieces["dirty"]: | |
1394 | rendered += "-dirty" | |
1395 | return rendered | |
1396 | ||
1397 | ||
1398 | def render(pieces, style): | |
1399 | """Render the given version pieces into the requested style.""" | |
1400 | if pieces["error"]: | |
1401 | return { | |
1402 | "version": "unknown", | |
1403 | "full-revisionid": pieces.get("long"), | |
1404 | "dirty": None, | |
1405 | "error": pieces["error"], | |
1406 | "date": None, | |
1407 | } | |
1408 | ||
1409 | if not style or style == "default": | |
1410 | style = "pep440" # the default | |
1411 | ||
1412 | if style == "pep440": | |
1413 | rendered = render_pep440(pieces) | |
1414 | elif style == "pep440-pre": | |
1415 | rendered = render_pep440_pre(pieces) | |
1416 | elif style == "pep440-post": | |
1417 | rendered = render_pep440_post(pieces) | |
1418 | elif style == "pep440-old": | |
1419 | rendered = render_pep440_old(pieces) | |
1420 | elif style == "git-describe": | |
1421 | rendered = render_git_describe(pieces) | |
1422 | elif style == "git-describe-long": | |
1423 | rendered = render_git_describe_long(pieces) | |
1424 | else: | |
1425 | raise ValueError("unknown style '%s'" % style) | |
1426 | ||
1427 | return { | |
1428 | "version": rendered, | |
1429 | "full-revisionid": pieces["long"], | |
1430 | "dirty": pieces["dirty"], | |
1431 | "error": None, | |
1432 | "date": pieces.get("date"), | |
1433 | } | |
1434 | ||
1435 | ||
1436 | class VersioneerBadRootError(Exception): | |
1437 | """The project root directory is unknown or missing key files.""" | |
1438 | ||
1439 | ||
1440 | def get_versions(verbose=False): | |
1441 | """Get the project version from whatever source is available. | |
1442 | ||
1443 | Returns a dict with 'version', 'full-revisionid', 'dirty', 'error', and 'date' keys. | |
1444 | """ | |
1445 | if "versioneer" in sys.modules: | |
1446 | # see the discussion in cmdclass.py:get_cmdclass() | |
1447 | del sys.modules["versioneer"] | |
1448 | ||
1449 | root = get_root() | |
1450 | cfg = get_config_from_root(root) | |
1451 | ||
1452 | assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" | |
1453 | handlers = HANDLERS.get(cfg.VCS) | |
1454 | assert handlers, "unrecognized VCS '%s'" % cfg.VCS | |
1455 | verbose = verbose or cfg.verbose | |
1456 | assert ( | |
1457 | cfg.versionfile_source is not None | |
1458 | ), "please set versioneer.versionfile_source" | |
1459 | assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" | |
1460 | ||
1461 | versionfile_abs = os.path.join(root, cfg.versionfile_source) | |
1462 | ||
1463 | # extract version from first of: _version.py, VCS command (e.g. 'git | |
1464 | # describe'), parentdir. This is meant to work for developers using a | |
1465 | # source checkout, for users of a tarball created by 'setup.py sdist', | |
1466 | # and for users of a tarball/zipball created by 'git archive' or github's | |
1467 | # download-from-tag feature or the equivalent in other VCSes. | |
1468 | ||
1469 | get_keywords_f = handlers.get("get_keywords") | |
1470 | from_keywords_f = handlers.get("keywords") | |
1471 | if get_keywords_f and from_keywords_f: | |
1472 | try: | |
1473 | keywords = get_keywords_f(versionfile_abs) | |
1474 | ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) | |
1475 | if verbose: | |
1476 | print("got version from expanded keyword %s" % ver) | |
1477 | return ver | |
1478 | except NotThisMethod: | |
1479 | pass | |
1480 | ||
1481 | try: | |
1482 | ver = versions_from_file(versionfile_abs) | |
1483 | if verbose: | |
1484 | print("got version from file %s %s" % (versionfile_abs, ver)) | |
1485 | return ver | |
1486 | except NotThisMethod: | |
1487 | pass | |
1488 | ||
1489 | from_vcs_f = handlers.get("pieces_from_vcs") | |
1490 | if from_vcs_f: | |
1491 | try: | |
1492 | pieces = from_vcs_f(cfg.tag_prefix, root, verbose) | |
1493 | ver = render(pieces, cfg.style) | |
1494 | if verbose: | |
1495 | print("got version from VCS %s" % ver) | |
1496 | return ver | |
1497 | except NotThisMethod: | |
1498 | pass | |
1499 | ||
1500 | try: | |
1501 | if cfg.parentdir_prefix: | |
1502 | ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) | |
1503 | if verbose: | |
1504 | print("got version from parentdir %s" % ver) | |
1505 | return ver | |
1506 | except NotThisMethod: | |
1507 | pass | |
1508 | ||
1509 | if verbose: | |
1510 | print("unable to compute version") | |
1511 | ||
1512 | return { | |
1513 | "version": "0+unknown", | |
1514 | "full-revisionid": None, | |
1515 | "dirty": None, | |
1516 | "error": "unable to compute version", | |
1517 | "date": None, | |
1518 | } | |
1519 | ||
1520 | ||
1521 | def get_version(): | |
1522 | """Get the short version string for this project.""" | |
1523 | return get_versions()["version"] | |
1524 | ||
1525 | ||
1526 | def get_cmdclass(): | |
1527 | """Get the custom setuptools/distutils subclasses used by Versioneer.""" | |
1528 | if "versioneer" in sys.modules: | |
1529 | del sys.modules["versioneer"] | |
1530 | # this fixes the "python setup.py develop" case (also 'install' and | |
1531 | # 'easy_install .'), in which subdependencies of the main project are | |
1532 | # built (using setup.py bdist_egg) in the same python process. Assume | |
1533 | # a main project A and a dependency B, which use different versions | |
1534 | # of Versioneer. A's setup.py imports A's Versioneer, leaving it in | |
1535 | # sys.modules by the time B's setup.py is executed, causing B to run | |
1536 | # with the wrong versioneer. Setuptools wraps the sub-dep builds in a | |
1537 | # sandbox that restores sys.modules to its pre-build state, so the | |
1538 | # parent is protected against the child's "import versioneer". By | |
1539 | # removing ourselves from sys.modules here, before the child build | |
1540 | # happens, we protect the child from the parent's versioneer too. | |
1541 | # Also see https://github.com/warner/python-versioneer/issues/52 | |
1542 | ||
1543 | cmds = {} | |
1544 | ||
1545 | # we add "version" to both distutils and setuptools | |
1546 | from distutils.core import Command | |
1547 | ||
1548 | class cmd_version(Command): | |
1549 | description = "report generated version string" | |
1550 | user_options = [] | |
1551 | boolean_options = [] | |
1552 | ||
1553 | def initialize_options(self): | |
1554 | pass | |
1555 | ||
1556 | def finalize_options(self): | |
1557 | pass | |
1558 | ||
1559 | def run(self): | |
1560 | vers = get_versions(verbose=True) | |
1561 | print("Version: %s" % vers["version"]) | |
1562 | print(" full-revisionid: %s" % vers.get("full-revisionid")) | |
1563 | print(" dirty: %s" % vers.get("dirty")) | |
1564 | print(" date: %s" % vers.get("date")) | |
1565 | if vers["error"]: | |
1566 | print(" error: %s" % vers["error"]) | |
1567 | ||
1568 | cmds["version"] = cmd_version | |
1569 | ||
1570 | # we override "build_py" in both distutils and setuptools | |
1571 | # | |
1572 | # most invocation pathways end up running build_py: | |
1573 | # distutils/build -> build_py | |
1574 | # distutils/install -> distutils/build ->.. | |
1575 | # setuptools/bdist_wheel -> distutils/install ->.. | |
1576 | # setuptools/bdist_egg -> distutils/install_lib -> build_py | |
1577 | # setuptools/install -> bdist_egg ->.. | |
1578 | # setuptools/develop -> ? | |
1579 | # pip install: | |
1580 | # copies source tree to a tempdir before running egg_info/etc | |
1581 | # if .git isn't copied too, 'git describe' will fail | |
1582 | # then does setup.py bdist_wheel, or sometimes setup.py install | |
1583 | # setup.py egg_info -> ? | |
1584 | ||
1585 | # we override different "build_py" commands for both environments | |
1586 | if "setuptools" in sys.modules: | |
1587 | from setuptools.command.build_py import build_py as _build_py | |
1588 | else: | |
1589 | from distutils.command.build_py import build_py as _build_py | |
1590 | ||
1591 | class cmd_build_py(_build_py): | |
1592 | def run(self): | |
1593 | root = get_root() | |
1594 | cfg = get_config_from_root(root) | |
1595 | versions = get_versions() | |
1596 | _build_py.run(self) | |
1597 | # now locate _version.py in the new build/ directory and replace | |
1598 | # it with an updated value | |
1599 | if cfg.versionfile_build: | |
1600 | target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) | |
1601 | print("UPDATING %s" % target_versionfile) | |
1602 | write_to_version_file(target_versionfile, versions) | |
1603 | ||
1604 | cmds["build_py"] = cmd_build_py | |
1605 | ||
1606 | if "cx_Freeze" in sys.modules: # cx_freeze enabled? | |
1607 | from cx_Freeze.dist import build_exe as _build_exe | |
1608 | ||
1609 | # nczeczulin reports that py2exe won't like the pep440-style string | |
1610 | # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. | |
1611 | # setup(console=[{ | |
1612 | # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION | |
1613 | # "product_version": versioneer.get_version(), | |
1614 | # ... | |
1615 | ||
1616 | class cmd_build_exe(_build_exe): | |
1617 | def run(self): | |
1618 | root = get_root() | |
1619 | cfg = get_config_from_root(root) | |
1620 | versions = get_versions() | |
1621 | target_versionfile = cfg.versionfile_source | |
1622 | print("UPDATING %s" % target_versionfile) | |
1623 | write_to_version_file(target_versionfile, versions) | |
1624 | ||
1625 | _build_exe.run(self) | |
1626 | os.unlink(target_versionfile) | |
1627 | with open(cfg.versionfile_source, "w") as f: | |
1628 | LONG = LONG_VERSION_PY[cfg.VCS] | |
1629 | f.write( | |
1630 | LONG | |
1631 | % { | |
1632 | "DOLLAR": "$", | |
1633 | "STYLE": cfg.style, | |
1634 | "TAG_PREFIX": cfg.tag_prefix, | |
1635 | "PARENTDIR_PREFIX": cfg.parentdir_prefix, | |
1636 | "VERSIONFILE_SOURCE": cfg.versionfile_source, | |
1637 | } | |
1638 | ) | |
1639 | ||
1640 | cmds["build_exe"] = cmd_build_exe | |
1641 | del cmds["build_py"] | |
1642 | ||
1643 | if "py2exe" in sys.modules: # py2exe enabled? | |
1644 | try: | |
1645 | from py2exe.distutils_buildexe import py2exe as _py2exe # py3 | |
1646 | except ImportError: | |
1647 | from py2exe.build_exe import py2exe as _py2exe # py2 | |
1648 | ||
1649 | class cmd_py2exe(_py2exe): | |
1650 | def run(self): | |
1651 | root = get_root() | |
1652 | cfg = get_config_from_root(root) | |
1653 | versions = get_versions() | |
1654 | target_versionfile = cfg.versionfile_source | |
1655 | print("UPDATING %s" % target_versionfile) | |
1656 | write_to_version_file(target_versionfile, versions) | |
1657 | ||
1658 | _py2exe.run(self) | |
1659 | os.unlink(target_versionfile) | |
1660 | with open(cfg.versionfile_source, "w") as f: | |
1661 | LONG = LONG_VERSION_PY[cfg.VCS] | |
1662 | f.write( | |
1663 | LONG | |
1664 | % { | |
1665 | "DOLLAR": "$", | |
1666 | "STYLE": cfg.style, | |
1667 | "TAG_PREFIX": cfg.tag_prefix, | |
1668 | "PARENTDIR_PREFIX": cfg.parentdir_prefix, | |
1669 | "VERSIONFILE_SOURCE": cfg.versionfile_source, | |
1670 | } | |
1671 | ) | |
1672 | ||
1673 | cmds["py2exe"] = cmd_py2exe | |
1674 | ||
1675 | # we override different "sdist" commands for both environments | |
1676 | if "setuptools" in sys.modules: | |
1677 | from setuptools.command.sdist import sdist as _sdist | |
1678 | else: | |
1679 | from distutils.command.sdist import sdist as _sdist | |
1680 | ||
1681 | class cmd_sdist(_sdist): | |
1682 | def run(self): | |
1683 | versions = get_versions() | |
1684 | self._versioneer_generated_versions = versions | |
1685 | # unless we update this, the command will keep using the old | |
1686 | # version | |
1687 | self.distribution.metadata.version = versions["version"] | |
1688 | return _sdist.run(self) | |
1689 | ||
1690 | def make_release_tree(self, base_dir, files): | |
1691 | root = get_root() | |
1692 | cfg = get_config_from_root(root) | |
1693 | _sdist.make_release_tree(self, base_dir, files) | |
1694 | # now locate _version.py in the new base_dir directory | |
1695 | # (remembering that it may be a hardlink) and replace it with an | |
1696 | # updated value | |
1697 | target_versionfile = os.path.join(base_dir, cfg.versionfile_source) | |
1698 | print("UPDATING %s" % target_versionfile) | |
1699 | write_to_version_file( | |
1700 | target_versionfile, self._versioneer_generated_versions | |
1701 | ) | |
1702 | ||
1703 | cmds["sdist"] = cmd_sdist | |
1704 | ||
1705 | return cmds | |
1706 | ||
1707 | ||
1708 | CONFIG_ERROR = """ | |
1709 | setup.cfg is missing the necessary Versioneer configuration. You need | |
1710 | a section like: | |
1711 | ||
1712 | [versioneer] | |
1713 | VCS = git | |
1714 | style = pep440 | |
1715 | versionfile_source = src/myproject/_version.py | |
1716 | versionfile_build = myproject/_version.py | |
1717 | tag_prefix = | |
1718 | parentdir_prefix = myproject- | |
1719 | ||
1720 | You will also need to edit your setup.py to use the results: | |
1721 | ||
1722 | import versioneer | |
1723 | setup(version=versioneer.get_version(), | |
1724 | cmdclass=versioneer.get_cmdclass(), ...) | |
1725 | ||
1726 | Please read the docstring in ./versioneer.py for configuration instructions, | |
1727 | edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. | |
1728 | """ | |
1729 | ||
1730 | SAMPLE_CONFIG = """ | |
1731 | # See the docstring in versioneer.py for instructions. Note that you must | |
1732 | # re-run 'versioneer.py setup' after changing this section, and commit the | |
1733 | # resulting files. | |
1734 | ||
1735 | [versioneer] | |
1736 | #VCS = git | |
1737 | #style = pep440 | |
1738 | #versionfile_source = | |
1739 | #versionfile_build = | |
1740 | #tag_prefix = | |
1741 | #parentdir_prefix = | |
1742 | ||
1743 | """ | |
1744 | ||
1745 | INIT_PY_SNIPPET = """ | |
1746 | from ._version import get_versions | |
1747 | __version__ = get_versions()['version'] | |
1748 | del get_versions | |
1749 | """ | |
1750 | ||
1751 | ||
1752 | def do_setup(): | |
1753 | """Main VCS-independent setup function for installing Versioneer.""" | |
1754 | root = get_root() | |
1755 | try: | |
1756 | cfg = get_config_from_root(root) | |
1757 | except ( | |
1758 | EnvironmentError, | |
1759 | configparser.NoSectionError, | |
1760 | configparser.NoOptionError, | |
1761 | ) as e: | |
1762 | if isinstance(e, (EnvironmentError, configparser.NoSectionError)): | |
1763 | print("Adding sample versioneer config to setup.cfg", file=sys.stderr) | |
1764 | with open(os.path.join(root, "setup.cfg"), "a") as f: | |
1765 | f.write(SAMPLE_CONFIG) | |
1766 | print(CONFIG_ERROR, file=sys.stderr) | |
1767 | return 1 | |
1768 | ||
1769 | print(" creating %s" % cfg.versionfile_source) | |
1770 | with open(cfg.versionfile_source, "w") as f: | |
1771 | LONG = LONG_VERSION_PY[cfg.VCS] | |
1772 | f.write( | |
1773 | LONG | |
1774 | % { | |
1775 | "DOLLAR": "$", | |
1776 | "STYLE": cfg.style, | |
1777 | "TAG_PREFIX": cfg.tag_prefix, | |
1778 | "PARENTDIR_PREFIX": cfg.parentdir_prefix, | |
1779 | "VERSIONFILE_SOURCE": cfg.versionfile_source, | |
1780 | } | |
1781 | ) | |
1782 | ||
1783 | ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") | |
1784 | if os.path.exists(ipy): | |
1785 | try: | |
1786 | with open(ipy, "r") as f: | |
1787 | old = f.read() | |
1788 | except EnvironmentError: | |
1789 | old = "" | |
1790 | if INIT_PY_SNIPPET not in old: | |
1791 | print(" appending to %s" % ipy) | |
1792 | with open(ipy, "a") as f: | |
1793 | f.write(INIT_PY_SNIPPET) | |
1794 | else: | |
1795 | print(" %s unmodified" % ipy) | |
1796 | else: | |
1797 | print(" %s doesn't exist, ok" % ipy) | |
1798 | ipy = None | |
1799 | ||
1800 | # Make sure both the top-level "versioneer.py" and versionfile_source | |
1801 | # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so | |
1802 | # they'll be copied into source distributions. Pip won't be able to | |
1803 | # install the package without this. | |
1804 | manifest_in = os.path.join(root, "MANIFEST.in") | |
1805 | simple_includes = set() | |
1806 | try: | |
1807 | with open(manifest_in, "r") as f: | |
1808 | for line in f: | |
1809 | if line.startswith("include "): | |
1810 | for include in line.split()[1:]: | |
1811 | simple_includes.add(include) | |
1812 | except EnvironmentError: | |
1813 | pass | |
1814 | # That doesn't cover everything MANIFEST.in can do | |
1815 | # (http://docs.python.org/2/distutils/sourcedist.html#commands), so | |
1816 | # it might give some false negatives. Appending redundant 'include' | |
1817 | # lines is safe, though. | |
1818 | if "versioneer.py" not in simple_includes: | |
1819 | print(" appending 'versioneer.py' to MANIFEST.in") | |
1820 | with open(manifest_in, "a") as f: | |
1821 | f.write("include versioneer.py\n") | |
1822 | else: | |
1823 | print(" 'versioneer.py' already in MANIFEST.in") | |
1824 | if cfg.versionfile_source not in simple_includes: | |
1825 | print( | |
1826 | " appending versionfile_source ('%s') to MANIFEST.in" | |
1827 | % cfg.versionfile_source | |
1828 | ) | |
1829 | with open(manifest_in, "a") as f: | |
1830 | f.write("include %s\n" % cfg.versionfile_source) | |
1831 | else: | |
1832 | print(" versionfile_source already in MANIFEST.in") | |
1833 | ||
1834 | # Make VCS-specific changes. For git, this means creating/changing | |
1835 | # .gitattributes to mark _version.py for export-subst keyword | |
1836 | # substitution. | |
1837 | do_vcs_install(manifest_in, cfg.versionfile_source, ipy) | |
1838 | return 0 | |
1839 | ||
1840 | ||
1841 | def scan_setup_py(): | |
1842 | """Validate the contents of setup.py against Versioneer's expectations.""" | |
1843 | found = set() | |
1844 | setters = False | |
1845 | errors = 0 | |
1846 | with open("setup.py", "r") as f: | |
1847 | for line in f.readlines(): | |
1848 | if "import versioneer" in line: | |
1849 | found.add("import") | |
1850 | if "versioneer.get_cmdclass()" in line: | |
1851 | found.add("cmdclass") | |
1852 | if "versioneer.get_version()" in line: | |
1853 | found.add("get_version") | |
1854 | if "versioneer.VCS" in line: | |
1855 | setters = True | |
1856 | if "versioneer.versionfile_source" in line: | |
1857 | setters = True | |
1858 | if len(found) != 3: | |
1859 | print("") | |
1860 | print("Your setup.py appears to be missing some important items") | |
1861 | print("(but I might be wrong). Please make sure it has something") | |
1862 | print("roughly like the following:") | |
1863 | print("") | |
1864 | print(" import versioneer") | |
1865 | print(" setup( version=versioneer.get_version(),") | |
1866 | print(" cmdclass=versioneer.get_cmdclass(), ...)") | |
1867 | print("") | |
1868 | errors += 1 | |
1869 | if setters: | |
1870 | print("You should remove lines like 'versioneer.VCS = ' and") | |
1871 | print("'versioneer.versionfile_source = ' . This configuration") | |
1872 | print("now lives in setup.cfg, and should be removed from setup.py") | |
1873 | print("") | |
1874 | errors += 1 | |
1875 | return errors | |
1876 | ||
1877 | ||
1878 | if __name__ == "__main__": | |
1879 | cmd = sys.argv[1] | |
1880 | if cmd == "setup": | |
1881 | errors = do_setup() | |
1882 | errors += scan_setup_py() | |
1883 | if errors: | |
1884 | sys.exit(1) |