fsspec / f7aa29e
New upstream version 0.6.0 eamanu 4 years ago
72 changed file(s) with 11736 addition(s) and 0 deletion(s).
0 [run]
1 omit =
2 */test_*.py
3 fsspec/_version.py
4 source =
5 fsspec
6
7 [report]
8 # Regexes for lines to exclude from consideration
9 exclude_lines =
10 pragma: no cover
11
12 raise AssertionError
13 raise NotImplementedError
14 pass
15
16 ignore_errors = True
0 fsspec/_version.py export-subst
0 # Dask
1 dask-worker-space
2
3 # Byte-compiled / optimized / DLL files
4 __pycache__/
5 *.py[cod]
6 *$py.class
7
8 # C extensions
9 *.so
10
11 # Distribution / packaging
12 .Python
13 env/
14 build/
15 develop-eggs/
16 dist/
17 downloads/
18 eggs/
19 .eggs/
20 lib/
21 lib64/
22 parts/
23 sdist/
24 var/
25 wheels/
26 *.egg-info/
27 .installed.cfg
28 *.egg
29 pip-wheel-metadata/
30
31 # PyInstaller
32 # Usually these files are written by a python script from a template
33 # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 *.manifest
35 *.spec
36
37 # Installer logs
38 pip-log.txt
39 pip-delete-this-directory.txt
40
41 # Unit test / coverage reports
42 htmlcov/
43 .tox/
44 .coverage
45 .coverage.*
46 .cache
47 nosetests.xml
48 coverage.xml
49 *.cover
50 .hypothesis/
51
52 # Translations
53 *.mo
54 *.pot
55
56 # Django stuff:
57 *.log
58 local_settings.py
59
60 # Flask stuff:
61 instance/
62 .webassets-cache
63
64 # Scrapy stuff:
65 .scrapy
66
67 # Sphinx documentation
68 docs/_build/
69
70 # PyBuilder
71 target/
72
73 # Jupyter Notebook
74 .ipynb_checkpoints
75
76 # pyenv
77 .python-version
78
79 # celery beat schedule file
80 celerybeat-schedule
81
82 # SageMath parsed files
83 *.sage.py
84
85 # dotenv
86 .env
87
88 # virtualenv
89 .venv
90 venv/
91 ENV/
92 .idea/
93
94 # Spyder project settings
95 .spyderproject
96 .spyproject
97
98 # Rope project settings
99 .ropeproject
100
101 # mkdocs documentation
102 /site
103
104 # mypy
105 .mypy_cache/
0 exclude: >
1 (?x)^(
2 \.tox/.*
3 )$
4 default_language_version:
5 python: python3.7
6 repos:
7 - repo: local
8 hooks:
9 - id: black
10 name: black
11 entry: black
12 language: python
13 require_serial: true
14 types: [python]
15 - repo: https://github.com/pre-commit/pre-commit-hooks
16 rev: v2.3.0
17 hooks:
18 - id: flake8
0 sudo: required
1 dist: xenial
2 os:
3 - linux
4 services:
5 - docker
6
7 language: generic
8 env:
9 - TOXENV=py35
10 - TOXENV=py36
11 - TOXENV=py37
12 - TOXENV=coverage
13 - TOXENV=lint
14 - TOXENV=s3fs
15 - TOXENV=gcsfs
16 install:
17 - source ci/install.sh
18 script:
19 - tox -v
20
21 notifications:
22 email: false
0 BSD 3-Clause License
1
2 Copyright (c) 2018, Martin Durant
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice, this
9 list of conditions and the following disclaimer.
10
11 * Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 * Neither the name of the copyright holder nor the names of its
16 contributors may be used to endorse or promote products derived from
17 this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0 include versioneer.py
1 include fsspec/_version.py
2
3 include LICENSE
4 include README.rst
5 include requirements.txt
0 # filesystem_spec
1
2 [![Build Status](https://travis-ci.org/intake/filesystem_spec.svg?branch=master)](https://travis-ci.org/intake/filesystem_spec)
3 [![Docs](https://readthedocs.org/projects/filesystem-spec/badge/?version=latest)](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)
4
5 A specification for pythonic filesystems.
6
7 ## Install
8
9 ```bash
10 pip install fsspec
11 ```
12 or
13 ```bash
14 conda install -c conda-forge fsspec
15 ```
16
17 ## Purpose
18
19 To produce a template or specification for a file-system interface that specific implementations should follow,
20 so that applications making use of them can rely on a common behaviour and not have to worry about the specific
21 internal implementation decisions with any given backend. Many such implementations are included in this package,
22 or in sister projects such as `s3fs` and `gcsfs`.
23
24 In addition, if this is well designed, then additional functionality, such as a key-value store or FUSE
25 mounting of the file-system implementation, may be available for all implementations "for free".
26
27 ## Documentation
28
29 Please refer to [RTD](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)
30
31 ## Develop
32
33 fsspec uses [tox](https://tox.readthedocs.io/en/latest/) and
34 [tox-conda](https://github.com/tox-dev/tox-conda) to manage dev and test
35 environments. First, install tox and tox-conda into a base conda environment
36 (e.g. `conda install -c conda-forge tox tox-conda`). Calls to `tox` can then be
37 used to configure a development environment and run tests.
38
39 Next, set up a development conda environment via `tox -e dev`. This will
40 install fsspec dependencies, test & dev tools, and install fsspec in develop
41 mode. Then, activate the dev environment under `.tox/dev` via `conda activate .tox/dev`.
42
43 ### Testing
44
45 Tests can be run directly in the activated dev environment via `pytest fsspec`.
46
47 The full fsspec test suite can be run via `tox`, which will set up and execute
48 tests against multiple dependency versions in isolated environments. Run `tox
49 -av` to list available test environments, and select one via `tox -e <env>`.
50
51 The full fsspec suite requires a system-level docker, docker-compose, and fuse
52 installation. See `ci/install.sh` for a detailed installation example.
53
54 ### Code Formatting
55
56 fsspec uses [Black](https://black.readthedocs.io/en/stable) to ensure
57 a consistent code format throughout the project. ``black`` is automatically
58 installed in the tox dev env, activated via `conda activate .tox/dev`.
59
60 Then, run `black fsspec` from the root of the filesystem_spec repository to
61 auto-format your code. Additionally, many editors have plugins that will apply
62 `black` as you edit files.
63
64 Optionally, you may wish to set up [pre-commit hooks](https://pre-commit.com) to
65 automatically run `black` when you make a git commit. ``black`` is automatically
66 installed in the tox dev env, activated via `conda activate .tox/dev`.
67
68 Then, run `pre-commit install --install-hooks` from the root of the
69 filesystem_spec repository to set up pre-commit hooks. `black` will now be run
70 before you commit, reformatting any changed files. You can format without
71 committing via `pre-commit run` or skip these checks with `git commit
72 --no-verify`.
0 #!/usr/bin/env bash
1 # https://docs.travis-ci.com/user/docker/#using-docker-compose
2
3
4 DOCKER_COMPOSE_VERSION=${DOCKER_COMPOSE_VERSION:-1.23.2}
5
6 # Install docker
7 curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
8 sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
9 sudo apt-get update
10 sudo apt-get -y -o Dpkg::Options::="--force-confnew" install docker-ce
11
12 # Update docker-compose
13 sudo rm /usr/local/bin/docker-compose
14 curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose
15 chmod +x docker-compose
16 sudo mv docker-compose /usr/local/bin
17
18 # install FUSE
19 sudo apt-get install libfuse-dev
20
21 # install conda
22 source $(dirname $BASH_SOURCE)/install_conda.sh
0 #!/usr/bin/env bash
1
2 # Install conda
3 wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
4 bash miniconda.sh -b -p $HOME/miniconda
5 export PATH="$HOME/miniconda/bin:$PATH"
6 conda config --set always_yes yes --set changeps1 no
7 conda update conda
8 conda install -c conda-forge tox tox-conda
0 # Minimal makefile for Sphinx documentation
1 #
2
3 # You can set these variables from the command line.
4 SPHINXOPTS =
5 SPHINXBUILD = sphinx-build
6 SPHINXPROJ = fsspec
7 SOURCEDIR = source
8 BUILDDIR = build
9
10 # Put it first so that "make" without argument is like "make help".
11 help:
12 @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13
14 .PHONY: help Makefile
15
16 # Catch-all target: route all unknown targets to Sphinx using the new
17 # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 %: Makefile
19 @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
0 # Building Documentation
1
2 A basic Python environment with the packages listed in `./requirements.txt` is
3 required to build the docs; see also `environment.yml`.
4
5 To make HTML documentation:
6
7 ```bash
8 make html
9 ```
10
11 Outputs to `build/html/index.html`
0 name: fsspec
1 channels:
2 - defaults
3 - conda-forge
4 dependencies:
5 - python=3.6
6 - paramiko
7 - requests
8 - numpydoc
0 @ECHO OFF
1
2 pushd %~dp0
3
4 REM Command file for Sphinx documentation
5
6 if "%SPHINXBUILD%" == "" (
7 set SPHINXBUILD=sphinx-build
8 )
9 set SOURCEDIR=source
10 set BUILDDIR=build
11 set SPHINXPROJ=fsspec
12
13 if "%1" == "" goto help
14
15 %SPHINXBUILD% >NUL 2>NUL
16 if errorlevel 9009 (
17 echo.
18 echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 echo.installed, then set the SPHINXBUILD environment variable to point
20 echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 echo.may add the Sphinx directory to PATH.
22 echo.
23 echo.If you don't have Sphinx installed, grab it from
24 echo.http://sphinx-doc.org/
25 exit /b 1
26 )
27
28 %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 goto end
30
31 :help
32 %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33
34 :end
35 popd
0 API Reference
1 =============
2
3 .. currentmodule:: fsspec
4
5 User Functions
6 --------------
7
8 .. autosummary::
9 fsspec.open_files
10 fsspec.open
11 fsspec.filesystem
12 fsspec.get_filesystem_class
13 fsspec.get_mapper
14 fsspec.fuse.run
15
16 .. autofunction:: fsspec.open_files
17 .. autofunction:: fsspec.open
18 .. autofunction:: fsspec.filesystem
19 .. autofunction:: fsspec.get_filesystem_class
20 .. autofunction:: fsspec.get_mapper
21 .. autofunction:: fsspec.fuse.run
22
23 Base Classes
24 ------------
25
26 .. autosummary::
27 fsspec.spec.AbstractFileSystem
28 fsspec.spec.Transaction
29 fsspec.spec.AbstractBufferedFile
30 fsspec.FSMap
31 fsspec.core.OpenFile
32 fsspec.core.BaseCache
33
34 .. autoclass:: fsspec.spec.AbstractFileSystem
35
36 .. autoclass:: fsspec.spec.Transaction
37 :members:
38
39 .. autoclass:: fsspec.spec.AbstractBufferedFile
40 :members:
41
42 .. autoclass:: fsspec.FSMap
43 :members:
44
45 .. autoclass:: fsspec.core.OpenFile
46 :members:
47
48 .. autoclass:: fsspec.core.BaseCache
49 :members:
50
51
52 .. _implementations:
53
54 Built-in Implementations
55 ------------------------
56
57 .. autosummary::
58 fsspec.implementations.ftp.FTPFileSystem
59 fsspec.implementations.hdfs.PyArrowHDFS
60 fsspec.implementations.http.HTTPFileSystem
61 fsspec.implementations.local.LocalFileSystem
62 fsspec.implementations.memory.MemoryFileSystem
63 fsspec.implementations.sftp.SFTPFileSystem
64 fsspec.implementations.webhdfs.WebHDFS
65 fsspec.implementations.zip.ZipFileSystem
66 fsspec.implementations.cached.CachingFileSystem
67 fsspec.implementations.cached.WholeFileCacheFileSystem
68
69 .. autoclass:: fsspec.implementations.ftp.FTPFileSystem
70 :members: __init__
71
72 .. autoclass:: fsspec.implementations.hdfs.PyArrowHDFS
73 :members: __init__
74
75 .. autoclass:: fsspec.implementations.http.HTTPFileSystem
76 :members: __init__
77
78 .. autoclass:: fsspec.implementations.local.LocalFileSystem
79 :members:
80
81 .. autoclass:: fsspec.implementations.memory.MemoryFileSystem
82 :members: __init__
83
84 .. autoclass:: fsspec.implementations.sftp.SFTPFileSystem
85 :members: __init__
86
87 .. autoclass:: fsspec.implementations.webhdfs.WebHDFS
88 :members: __init__
89
90 .. autoclass:: fsspec.implementations.zip.ZipFileSystem
91 :members: __init__
92
93 .. autoclass:: fsspec.implementations.cached.CachingFileSystem
94 :members: __init__
95
96 .. autoclass:: fsspec.implementations.cached.WholeFileCacheFileSystem
97
98 .. _readbuffering:
99
100 Read Buffering
101 --------------
102
103 .. autosummary::
104
105 fsspec.caching.ReadAheadCache
106 fsspec.caching.BytesCache
107 fsspec.caching.MMapCache
108 fsspec.caching.BlockCache
109
110 .. autoclass:: fsspec.caching.ReadAheadCache
111 :members:
112
113 .. autoclass:: fsspec.caching.BytesCache
114 :members:
115
116 .. autoclass:: fsspec.caching.MMapCache
117 :members:
118
119 .. autoclass:: fsspec.caching.BlockCache
120 :members:
0 Changelog
1 =========
2
3 Version 0.6.0
4 -------------
5
6 * Fixed issues with filesystem instance caching. This was causing authorization errors
7 in downstream libraries like ``gcsfs`` and ``s3fs`` in multi-threaded code (:pr:`155`, :pr:`181`)
8 * Changed the default file caching strategy to :class:`fsspec.caching.ReadAheadCache` (:pr:`193`)
9 * Moved file caches to the new ``fsspec.caching`` module. They're still available from
10 their old location in ``fsspec.core``, but we recommend using the new location for new code (:pr:`195`)
11 * Added a new file caching strategy, :class:`fsspec.caching.BlockCache` for fetching and caching
12 file reads in blocks (:pr:`191`).
13 * Fixed equality checks for file system instances to return ``False`` when compared to objects
14 other than file systems (:pr:`192`)
15 * Fixed a bug in :meth:`fsspec.FSMap.keys` returning a generator, which was consumed upon iteration (:pr:`189`).
16 * Removed the magic addition of aliases in ``AbstractFileSystem.__init__``. Now alias methods are always
17 present (:pr:`177`)
18 * Deprecated passing ``trim`` to :class:`fsspec.spec.AbstractBufferedFile`. Pass it in ``storage_options`` instead (:pr:`188`)
19 * Improved handling of requests for :class:`fsspec.implementations.http.HTTPFileSystem` when the
20 HTTP server responds with an (incorrect) content-length of 0 (:pr:`163`)
21 * Added a ``detail=True`` parameter to :meth:`fsspec.spec.AbstractFileSystem.ls` (:pr:`168`)
22 * Fixed handling of UNC/DFS paths (:issue:`154`)
0 # -*- coding: utf-8 -*-
1 #
2 # fsspec documentation build configuration file, created by
3 # sphinx-quickstart on Mon Jan 15 18:11:02 2018.
4 #
5 # This file is execfile()d with the current directory set to its
6 # containing dir.
7 #
8 # Note that not all possible configuration values are present in this
9 # autogenerated file.
10 #
11 # All configuration values have a default; values that are commented out
12 # serve to show the default.
13
14 # If extensions (or modules to document with autodoc) are in another directory,
15 # add these directories to sys.path here. If the directory is relative to the
16 # documentation root, use os.path.abspath to make it absolute, like shown here.
17 #
18 import os
19 import sys
20
21 sys.path.insert(0, os.path.abspath("../.."))
22
23
24 # -- General configuration ------------------------------------------------
25
26 # If your documentation needs a minimal Sphinx version, state it here.
27 #
28 # needs_sphinx = '1.0'
29
30 # Add any Sphinx extension module names here, as strings. They can be
31 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 # ones.
33 extensions = [
34 "sphinx.ext.autodoc",
35 "sphinx.ext.viewcode",
36 "sphinx.ext.autosummary",
37 "sphinx.ext.extlinks",
38 "numpydoc",
39 ]
40
41 # Add any paths that contain templates here, relative to this directory.
42 templates_path = ["_templates"]
43
44 # The suffix(es) of source filenames.
45 # You can specify multiple suffix as a list of string:
46 #
47 # source_suffix = ['.rst', '.md']
48 source_suffix = ".rst"
49
50 # The master toctree document.
51 master_doc = "index"
52
53 # General information about the project.
54 project = "fsspec"
55 copyright = "2018, Martin Durant"
56 author = "Martin Durant"
57
58 # The version info for the project you're documenting, acts as replacement for
59 # |version| and |release|, also used in various other places throughout the
60 # built documents.
61 #
62 # The short X.Y version.
63 import fsspec
64
65 version = fsspec.__version__
66 # The full version, including alpha/beta/rc tags.
67 release = fsspec.__version__
68
69 # The language for content autogenerated by Sphinx. Refer to documentation
70 # for a list of supported languages.
71 #
72 # This is also used if you do content translation via gettext catalogs.
73 # Usually you set "language" from the command line for these cases.
74 language = None
75
76 # List of patterns, relative to source directory, that match files and
77 # directories to ignore when looking for source files.
78 # This patterns also effect to html_static_path and html_extra_path
79 exclude_patterns = []
80
81 # The name of the Pygments (syntax highlighting) style to use.
82 pygments_style = "sphinx"
83
84 # If true, `todo` and `todoList` produce output, else they produce nothing.
85 todo_include_todos = False
86
87
88 # -- Options for HTML output ----------------------------------------------
89
90 # The theme to use for HTML and HTML Help pages. See the documentation for
91 # a list of builtin themes.
92 #
93 html_theme = "sphinx_rtd_theme"
94
95 # Theme options are theme-specific and customize the look and feel of a theme
96 # further. For a list of options available for each theme, see the
97 # documentation.
98 #
99 # html_theme_options = {}
100
101 # Add any paths that contain custom static files (such as style sheets) here,
102 # relative to this directory. They are copied after the builtin static files,
103 # so a file named "default.css" will overwrite the builtin "default.css".
104 html_static_path = []
105
106 # Custom sidebar templates, must be a dictionary that maps document names
107 # to template names.
108 #
109 # This is required for the alabaster theme
110 # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
111 html_sidebars = {
112 "**": [
113 "relations.html", # needs 'show_related': True theme option to display
114 "searchbox.html",
115 ]
116 }
117
118
119 # -- Options for HTMLHelp output ------------------------------------------
120
121 # Output file base name for HTML help builder.
122 htmlhelp_basename = "fsspecdoc"
123
124
125 # -- Options for LaTeX output ---------------------------------------------
126
127 latex_elements = {
128 # The paper size ('letterpaper' or 'a4paper').
129 #
130 # 'papersize': 'letterpaper',
131 # The font size ('10pt', '11pt' or '12pt').
132 #
133 # 'pointsize': '10pt',
134 # Additional stuff for the LaTeX preamble.
135 #
136 # 'preamble': '',
137 # Latex figure (float) alignment
138 #
139 # 'figure_align': 'htbp',
140 }
141
142 # Grouping the document tree into LaTeX files. List of tuples
143 # (source start file, target name, title,
144 # author, documentclass [howto, manual, or own class]).
145 latex_documents = [
146 (master_doc, "fsspec.tex", "fsspec Documentation", "Joseph Crail", "manual")
147 ]
148
149
150 # -- Options for manual page output ---------------------------------------
151
152 # One entry per manual page. List of tuples
153 # (source start file, name, description, authors, manual section).
154 man_pages = [(master_doc, "fsspec", "fsspec Documentation", [author], 1)]
155
156
157 # -- Options for Texinfo output -------------------------------------------
158
159 # Grouping the document tree into Texinfo files. List of tuples
160 # (source start file, target name, title, author,
161 # dir menu entry, description, category)
162 texinfo_documents = [
163 (
164 master_doc,
165 "fsspec",
166 "fsspec Documentation",
167 author,
168 "fsspec",
169 "One line description of project.",
170 "Miscellaneous",
171 )
172 ]
173
174 extlinks = {
175 "issue": ("https://github.com/intake/filesystem_spec/issues/%s", "GH#"),
176 "pr": ("https://github.com/intake/filesystem_spec/pull/%s", "GH#"),
177 }
0 Features of fsspec
1 ==================
2
3 ``fsspec`` provides a consistent API to many different storage backends. The general API and functionality were
4 proven with the projects `s3fs`_ and `gcsfs`_ (along with `hdfs3`_ and `adlfs`_), within the
5 context of Dask and independently. These have been tried and tested by many users and have shown their
6 usefulness over some years. ``fsspec`` aims to build on these and unify their models, as well
7 as to extract file-system handling code from Dask, which does not fit so comfortably within a
8 library designed for task-graph creation and scheduling.
9
10 .. _s3fs: https://s3fs.readthedocs.io/en/latest/
11 .. _gcsfs: https://gcsfs.readthedocs.io/en/latest/
12 .. _hdfs3: https://hdfs3.readthedocs.io/en/latest/
13 .. _adlfs: https://azure-datalake-store.readthedocs.io/en/latest/
14
15 Here follows a brief description of some notable features of ``fsspec`` that promise to make
16 it an interesting project beyond some other file-system abstractions.
17
18 Serialisability
19 ---------------
20
21 Coming out of the Dask stable, it was an important design decision that file-system instances
22 be serialisable, so that they could be created in one process (e.g., the client) and used in
23 other processes (typically the workers). These other processes may even be on other machines,
24 so in many cases they would need to be able to re-establish credentials, ideally without passing
25 sensitive tokens in the pickled binary data.
26
27 ``fsspec`` instances, generally speaking, abide by these rules: they do not include locks, files and other
28 thread-local material, and, where possible, use local credentials (such as a token file)
29 for re-establishing sessions upon de-serialisation (while making use of cached instances, where
30 they exist; see below).
31
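As a minimal sketch of what this enables (assuming only the standard ``pickle`` module and the
bundled in-memory implementation), an instance survives a pickle round-trip:

.. code-block:: python

    import pickle

    import fsspec

    fs = fsspec.filesystem("memory")
    restored = pickle.loads(pickle.dumps(fs))
    # the restored object is a usable filesystem of the same class
    assert type(restored) is type(fs)
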
32 ``OpenFile`` instances
33 ----------------------
34
35 The :class:`fsspec.core.OpenFile` class provides a convenient, portable way to prescribe the manner in which to
36 open some file (local,
37 remote, in a compressed store, etc.), and can also apply any compression and
38 text-mode to the file. These instances are also serialisable, because they do not contain any open
39 files.
40
41 The way to work with ``OpenFile`` instances is to isolate interaction with them in a ``with`` context. It is
42 the initiation of the context which actually does the work of creating file-like instances.
43
44 .. code-block:: python
45
46 of = fsspec.open(url, ...)
47 # of is just a place-holder
48 with of as f:
49 # f is now a real file-like object holding resources
50 f.read(...)
51
52 Random Access and Buffering
53 ---------------------------
54
55 The :class:`fsspec.spec.AbstractBufferedFile` class is provided as an easy way to build file-like
56 interfaces to some service which is capable of providing blocks of bytes. This class is derived
57 from in a number of the existing implementations. A subclass of ``AbstractBufferedFile`` provides
58 random access for the underlying file-like data (without downloading the whole thing) and
59 configurable read-ahead buffers to minimise the number of the read operations that need to be
60 performed on the back-end storage.
61
62 This is also a critical feature in the big-data access model, where each sub-task of an operation
63 may need only a small part of a file, and does not, therefore, want to be forced into downloading the
64 whole thing.
65
66 Transparent text-mode and compression
67 -------------------------------------
68
69 As mentioned above, the ``OpenFile`` class allows for the opening of files on a binary store,
70 which appear to be in text mode and/or allow for a compression/decompression layer between the
71 caller and the back-end storage system. From the user's point of view, this is achieved simply
72 by passing arguments to the :func:`fsspec.open_files` or :func:`fsspec.open` functions, and
73 thereafter happens transparently.
74
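For example (a sketch only; the URL is a placeholder, and the set of available compressions depends
on which codec libraries are installed):

.. code-block:: python

    import fsspec

    # decompress on the fly and decode bytes to text while reading
    with fsspec.open("https://example.com/data.csv.gz", mode="rt",
                     compression="gzip", encoding="utf8") as f:
        header = f.readline()
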
75 Key-value stores
76 ----------------
77
78 File-systems are naturally dict-like key-value mappings: each (string) path corresponds to some
79 binary data on the storage back-end. For some use-cases, it is very convenient to be able to
80 view some path within the file-system as a dict-like store, and the function :func:`fsspec.get_mapper`
81 gives a one-stop way to return such an object. This has become useful, for example, in the
82 context of the `zarr`_ project, which stores its array chunks in keys in any arbitrary mapping-like
83 object.
84
85 .. code-block:: python
86
87 mapper = fsspec.get_mapper('protocol://server/path', args)
88 list(mapper)
89 mapper[k] = b'some data'
90
91 .. _zarr: https://zarr.readthedocs.io/en/stable/
92
93 PyArrow integration
94 -------------------
95
96 `pyarrow`_ has its own internal idea of what a file-system is (``pyarrow.filesystem.FileSystem``),
97 and some functions, particularly the loading of parquet, require that the target be compatible.
98 As it happens, the design of the file-system interface in ``pyarrow`` *is* compatible with ``fsspec``
99 (this is not by accident). Therefore at import time, ``fsspec`` checks for the existence of
100 ``pyarrow``, and, if found, adds it to the superclasses of the spec base-class. In this manner,
101 all ``fsspec``-derived file-systems are also pyarrow file-systems, and can be used by pyarrow
102 functions.
103
104 .. _pyarrow: https://arrow.apache.org/docs/python/
105
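A small sketch of this behaviour (assuming an older ``pyarrow`` release that still provides
``pyarrow.filesystem.FileSystem``, and that it was importable when ``fsspec`` was first imported):

.. code-block:: python

    import pyarrow
    import fsspec

    fs = fsspec.filesystem("memory")
    # fsspec-derived filesystems are then also pyarrow filesystems
    assert isinstance(fs, pyarrow.filesystem.FileSystem)
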
106 Transactions
107 ------------
108
109 ``fsspec`` supports *transactions*, during which writes to files on a remote store are deferred
110 (typically put into a temporary location) until the transaction is over, whereupon the whole
111 transaction is finalised in a semi-atomic way, and all the files are moved/committed to their
112 final destination. The implementation details are file-system specific (and not all
113 implementations support transactions yet), but the idea is
114 that all files should get written, or none, to mitigate data corruption. The feature
115 can be used like this:
116
117 .. code-block:: python
118
119 fs = fsspec.filesystem(...)
120 with fs.transaction:
121 with fs.open('file1', 'wb') as f:
122 f.write(b'some data')
123 with fs.open('file2', 'wb') as f:
124 f.write(b'more data')
125
126 Here, files 1 and 2 do not get moved to the target location until the transaction context finishes.
127 If the context finishes due to an (uncaught) exception, then the files are discarded and the
128 target file locations are left untouched.
129
130 The class :class:`fsspec.spec.Transaction` allows for fine-tuning of the operation, and every
131 ``fsspec`` instance has an instance of this as an attribute ``.transaction`` to give access.
132
133 Note that synchronising transactions across multiple instances, perhaps across a cluster,
134 is a harder problem to solve, and the implementation described here is only part of the solution.
135
136 Mount anything with FUSE
137 ------------------------
138
139 Any path of any file-system can be mapped to a local directory using pyfuse and
140 :func:`fsspec.fuse.run`. This feature is experimental, but basic file listing with
141 details, and read/write should generally be available to the extent that the
142 remote file-system provides enough information. Naturally, if a file-system is read-only,
143 then write operations will fail - but they will tend to fail late and with obscure
144 error messages such as "bad address".
145
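A sketch of how mounting might look (the argument order ``run(fs, remote_path, mount_point)`` and the
paths shown are assumptions; a system FUSE installation and an existing empty mount directory are required):

.. code-block:: python

    import fsspec
    from fsspec.fuse import run

    fs = fsspec.filesystem("memory")
    with fs.open("/data/hello.txt", "wb") as f:
        f.write(b"hello world")

    # blocks in the foreground, serving "/data/" at the local mount point
    run(fs, "/data/", "/tmp/fsspec_mount/")
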
146 Some specific quirks of some file-systems may cause confusion for FUSE. For example,
147 it is possible for a given path on s3 to be both a valid key (i.e., containing binary
148 data, like a file) and a valid prefix (i.e., can be listed to find subkeys, like a
149 directory). Since this breaks the assumptions of a normal file-system, it may not
150 be possible to reach all paths on the remote.
151
152 Instance Caching
153 ----------------
154
155 If a file-system implementation class is marked as *cachable* (attribute ``.cachable``),
156 then its instances will
157 get stored in a class attribute, to enable quick look-up instead of needing to regenerate
158 potentially expensive connections and sessions. The key in the cache is a tokenisation of
159 the arguments used to create the instance. The cache itself (attribute ``._cache``)
160 is currently a simple dict, but could in the future be an LRU cache, or something more complicated,
161 to fine-tune instance lifetimes.
162
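A quick illustration (a sketch using the bundled local filesystem, which is cachable by default):

.. code-block:: python

    import fsspec

    fs1 = fsspec.filesystem("file")
    fs2 = fsspec.filesystem("file")
    # identical arguments tokenise to the same key, so the same instance is returned
    assert fs1 is fs2
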
163 Since files can hold on to write caches and read buffers,
164 the instance cache may cause excessive memory usage in some situations; but normally, files
165 will get ``close``d, and the data discarded. Only when there is also an unfinalised transaction or a
166 captured traceback might this become a problem.
167
168 File Buffering
169 --------------
170
171 Most implementations create file objects which derive from ``fsspec.spec.AbstractBufferedFile``, and
172 have many behaviours in common. These files offer buffering of both read and write operations, so that
173 communication with the remote resource is limited. The size of the buffer is generally configured
174 with the ``blocksize=`` kwarg at open time, although the implementation may have some minimum or
175 maximum sizes that need to be respected.
176
177 For reading, a number of buffering schemes are available, listed in ``fsspec.caching.caches``
178 (see :ref:`readbuffering`), or "none" for no buffering at all. For example, for a simple read-ahead
179 buffer, you can do:
180
181 .. code-block:: python
182
183 fs = fsspec.filesystem(...)
184 with fs.open(path, mode='rb', cache_type='readahead') as f:
185 use_for_something(f)
186
187 Caching Files Locally
188 ---------------------
189
190 ``fsspec`` allows you to access data on remote file systems; that is its purpose. However, such
191 access can often be rather slow compared to local storage, so, as well as buffering (see above), the
192 option exists to copy files locally when you first access them, and thereafter to use the local data.
193 This local cache of data might be temporary (i.e., attached to the process and discarded when the
194 process ends) or at some specific location in your local storage.
195
196 Two mechanisms are provided, and both involve wrapping a `target` filesystem. The following example
197 creates a file-based cache.
198
199 .. code-block:: python
200
201 fs = fsspec.filesystem("filecache", target_protocol='s3', target_options={'anon': True},
202 cache_storage='/tmp/files/')
203
204 Each time you open a remote file on S3, it will first be copied to
205 the given local directory, and then all further access will use the local file. Since we specify
206 a particular local location, the files will persist and can be reused in future sessions, although
207 you can also set policies to have cached files expire after some time, or to check the remote file system
208 on each open, to see if the target file has changed since it was copied.
209
210 With the "blockcache" variant, data is downloaded block-wise: only the specific parts of the remote file
211 which are accessed. This means that the local copy of the file might end up being much smaller than the
212 remote one, if only certain parts of it are required.
213
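A corresponding sketch for the block-wise variant (the option names mirror the "filecache" example
above and should be treated as assumptions, as should the bucket path):

.. code-block:: python

    import fsspec

    fs = fsspec.filesystem("blockcache", target_protocol="s3",
                           target_options={"anon": True},
                           cache_storage="/tmp/blocks/")

    with fs.open("bucket/large-file.bin", mode="rb") as f:
        f.read(2 ** 20)   # only the blocks touched by this read are fetched and cached
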
214 Whereas "filecache" works for all file system implementations, and provides a real local file for other
215 libraries to use, "blockcache" has restrictions: that you have a storage/OS combination which supports
216 sparse files, that the backend implementation uses files which derive from ``AbstractBufferedFile``,
217 and that the library you pass the resultant object to accepts generic python file-like objects. You
218 should not mix block- and file-caches in the same directory.
fsspec: python filesystem interfaces
1 ======================================
2
3 Filesystem Spec is a project to unify various projects and classes that work with remote filesystems and
4 file-system-like abstractions using a standard pythonic interface.
5
6
7 .. _highlight:
8
9 Highlights
10 ----------
11
12 - based on s3fs and gcsfs
13 - ``fsspec`` instances are serializable and can be passed between processes/machines
14 - the ``OpenFiles`` file-like instances are also serializable
15 - implementations provide random access, to enable only the part of a file required to be read; plus a template
16 to base other file-like classes on
17 - file access can use transparent compression and text-mode
18 - any file-system directory can be viewed as a key-value/mapping store
19 - if pyarrow is installed, all file-system classes also subclass from ``pyarrow.filesystem.FileSystem``, so
20 can work with any arrow function expecting such an instance
21 - writes can be transactional: stored in a temporary location and only moved to the final
22 destination when the transaction is committed
23 - FUSE: mount any path from any backend to a point on your file-system
24 - cached instances tokenised on the instance parameters
25
26 These are described further in the :doc:`features` section.
27
28 Installation
29 ------------
30
31 pip install fsspec
32
33 or
34
35 conda install -c conda-forge fsspec
36
37 Implementations
38 ---------------
39
40 This repo contains several file-system implementations, see :ref:`implementations`. However,
41 the external projects ``s3fs`` and ``gcsfs`` depend on ``fsspec`` and share the same behaviours.
42 ``Dask`` and ``Intake`` use ``fsspec`` internally for their IO needs.
43
44 The current list of known implementations can be found as follows
45
46 .. code-block:: python
47
48 from fsspec.registry import known_implementations
49 known_implementations
50
51 These are only imported on request, which may fail if a required dependency is missing. The dictionary
52 ``fsspec.registry`` contains all imported implementations, and can be mutated by user code, if necessary.
53
54
55 .. toctree::
56 :maxdepth: 2
57 :caption: Contents:
58
59 intro.rst
60 usage.rst
61 features.rst
62 api.rst
63 changelog.rst
64
65
66 Indices and tables
67 ==================
68
69 * :ref:`genindex`
70 * :ref:`modindex`
71 * :ref:`search`
0 Introduction
1 ============
2
3 To get stuck into using the package, rather than reading about its philosophy and history, you can
4 skip to :doc:`usage`.
5
6 Background
7 ----------
8
9 Python provides a standard interface for open files, so that alternate implementations of file-like objects can
10 work seamlessly with many functions which rely only on the methods of that standard interface. A number of libraries
11 have implemented a similar concept for file-systems, where file operations can be performed on a logical file-system
12 which may be local, a structured data store, or some remote service.
13
14 This repository is intended to be a place to define a standard interface that such file-systems should adhere to,
15 such that code using them should not have to know the details of the implementation in order to operate on any of
16 a number of backends. The hope is that the community can come together to
17 define an interface that is best for the largest number of users, and that having the specification makes developing
18 other file-system implementations simpler.
19
20 History
21 -------
22
23 I (Martin Durant) have been involved in building a number of remote-data file-system implementations, principally
24 in the context of the `Dask`_ project. In particular, several are listed
25 in `docs`_ with links to the specific repositories.
26 With common authorship, there is much that is similar between the implementations, for example posix-like naming
27 of the operations, and this has allowed Dask to be able to interact with the various backends and parse generic
28 URLs in order to select amongst them. However, *some* extra code was required in each case to adapt the peculiarities
29 of each implementation to the generic usage that Dask demanded. People may find the
30 `code`_ which parses URLs and creates file-system
31 instances interesting.
32
33 .. _Dask: http://dask.pydata.org/en/latest/
34 .. _docs: http://dask.pydata.org/en/latest/remote-data-services.html
35 .. _code: https://github.com/dask/dask/blob/master/dask/bytes/core.py#L266
36
37 At the same time, the Apache `Arrow`_ project was also concerned with a similar problem,
38 particularly a common interface to local and HDFS files, for example the
39 `hdfs`_ interface (which actually communicated with HDFS
40 with a choice of driver). These are mostly used internally within Arrow, but Dask was modified in order to be able
41 to use the alternate HDFS interface (which solves some security issues with `hdfs3`). In the process, a
42 `conversation`_
43 was started, and I invite all interested parties to continue the conversation in this location.
44
45 .. _Arrow: https://arrow.apache.org/
46 .. _hdfs: https://arrow.apache.org/docs/python/filesystems.html
47 .. _conversation: https://github.com/dask/dask/issues/2880
48
49 There is a good argument that this type of code has no place in Dask, which is concerned with making graphs
50 representing computations, and executing those graphs on a scheduler. Indeed, the file-systems are generally useful,
51 and each has a user-base wider than just those that work via Dask.
52
53 Influences
54 ----------
55
56 The following were places to consider when choosing the definitions of how we would like the file-system specification
57 to look:
58
59 - python's `os`_ module and its `path` namespace; also other file-connected
60 functionality in the standard library
61 - posix/bash method naming conventions that linux/unix/osx users are familiar with; or perhaps their Windows variants
62 - the existing implementations for the various backends (e.g.,
63 `gcsfs`_ or Arrow's
64 `hdfs`_)
65 - `pyfilesystems`_, an attempt to do something similar, with a
66 plugin architecture. This conception has several types of local file-system, and a lot of well-thought-out
67 validation code.
68
69 .. _os: https://docs.python.org/3/library/os.html
70 .. _gcsfs: http://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem
71 .. _pyfilesystems: https://docs.pyfilesystem.org/en/latest/index.html
72
73 Not pyfilesystems?
74 ------------------
75
76 It might have been conceivable to reuse code in ``pyfilesystems``, which has an established interface and several
77 implementations of its own. However, it supports none of the :ref:`highlight`, critical to
78 cloud and parallel access, and would not be easy to
79 coerce. Following on the success of ``s3fs`` and ``gcsfs``, and their use within Dask, it seemed best to
80 have an interface as close to those as possible. See a
81 `discussion`_ on the topic.
82
83 .. _discussion: https://github.com/intake/filesystem_spec/issues/5
84
85 Structure of the package
86 ------------------------
87
88 The best place to get a feel for the contents of ``fsspec`` is by looking through the :doc:`usage` and
89 :doc:`api` sections. In addition, the source code will be interesting for those who wish to subclass and
90 develop new file-system implementations. ``fsspec/spec.py`` contains the main abstract file-system class
91 to derive from, ``AbstractFileSystem``.
92
93 .. _zarr: https://zarr.readthedocs.io
0 Usage
1 =====
2
3 This is quick-start documentation to help people get familiar with the layout and functioning of ``fsspec``.
4
5 Instantiate a file-system
6 -------------------------
7
8 ``fsspec`` provides an abstract file-system interface as a template for other filesystems. In this context,
9 "interface" means an API for working with files on the given file-system, which can mean files on some
10 remote store, local files, files within some wrapper, or anything else that is capable of producing
11 file-like objects.
12
13 Some concrete implementations are bundled with ``fsspec`` and others can be installed separately. They
14 can be instantiated directly, or the `registry` can be used to find them.
15
16 Direct instantiation:
17
18 .. code-block:: python
19
20 from fsspec.implementations.local import LocalFileSystem
21 fs = LocalFileSystem()
22
23 Look-up via registry:
24
25 .. code-block:: python
26
27 import fsspec
28 fs = fsspec.filesystem('file')
29
30 Many filesystems also take extra parameters, some of which may be optional - see :doc:`api`.
31
32 .. code-block:: python
33
34 import fsspec
35 fs = fsspec.filesystem('ftp', host=host, port=port,
36 username=user, password=pw)
37
38 Use a file-system
39 -----------------
40
41 File-system instances offer a large number of methods for getting information about and manipulating files
42 for the given back-end. Although some specific implementations may not offer all features (e.g., ``http``
43 is read-only), generally all normal operations, such as ``ls``, ``rm``, should be expected to work (see the
44 full list: :class:`fsspec.spec.AbstractFileSystem`).
45 Note that this quick-start will prefer posix-style naming, but
46 many common operations are aliased: ``cp()`` and ``copy()`` are identical, for instance.
47 Functionality is generally chosen to be as close to the builtin ``os`` module's working for things like
48 ``glob`` as possible.
49
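A short sketch of typical calls (the paths are purely illustrative, and the copied file is assumed to exist):

.. code-block:: python

    import fsspec

    fs = fsspec.filesystem("file")
    fs.makedirs("/tmp/demo", exist_ok=True)
    fs.ls("/tmp/demo", detail=False)               # names only; detail=True gives info dicts
    fs.copy("/tmp/demo/a.txt", "/tmp/demo/b.txt")  # cp() is an equivalent alias
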
50 The ``open()`` method will return a file-like object which can be passed to any other library that expects
51 to work with python files. These will normally be binary-mode only, but may implement internal buffering
52 in order to limit the number of reads from a remote source. They respect the use of ``with`` contexts. If
53 you have ``pandas`` installed, for example, you can do the following:
54
55 .. code-block:: python
56
57 with fs.open('https://raw.githubusercontent.com/dask/'
58 'fastparquet/master/test-data/nation.csv') as f:
59 df = pd.read_csv(f, sep='|', header=None)
60
61 Higher-level
62 ------------
63
64 For many situations, the only function that will be needed is :func:`fsspec.open_files()`, which will return
65 :class:`fsspec.core.OpenFile` instances created from a single URL and parameters to pass to the backend.
66 This supports text-mode and compression on the fly, and the objects can be serialized for passing between
67 processes or machines (so long as each has access to the same backend file-system). The protocol (i.e.,
68 backend) is inferred from the URL passed, and glob characters are expanded in read mode (search for files)
69 or write mode (create names). Critically, the file on the backend system is not actually opened until the
70 ``OpenFile`` instance is used in a ``with`` context. For the example above:
71
72 .. code-block:: python
73
74 of = fsspec.open('https://raw.githubusercontent.com/dask/'
75 'fastparquet/master/test-data/nation.csv', mode='r')
76 # of is a not-yet-open OpenFile object. The "with" context actually opens it
77 with of as f:
78 # now f is a text-mode file
79 df = pd.read_csv(f, sep='|', header=None)
80
0 from ._version import get_versions
1
2 from .spec import AbstractFileSystem
3 from .registry import get_filesystem_class, registry, filesystem
4 from .mapping import FSMap, get_mapper
5 from .core import open_files, get_fs_token_paths, open
6 from . import caching
7
8 __version__ = get_versions()["version"]
9 del get_versions
10
11
12 __all__ = [
13 "AbstractFileSystem",
14 "FSMap",
15 "filesystem",
16 "get_filesystem_class",
17 "get_fs_token_paths",
18 "get_mapper",
19 "open",
20 "open_files",
21 "registry",
22 "caching",
23 ]
0 # This file helps to compute a version number in source trees obtained from
1 # git-archive tarball (such as those provided by githubs download-from-tag
2 # feature). Distribution tarballs (built by setup.py sdist) and build
3 # directories (produced by setup.py build) will contain a much shorter file
4 # that just contains the computed version number.
5
6 # This file is released into the public domain. Generated by
7 # versioneer-0.18 (https://github.com/warner/python-versioneer)
8
9 """Git implementation of _version.py."""
10
11 import errno
12 import os
13 import re
14 import subprocess
15 import sys
16
17
18 def get_keywords():
19 """Get the keywords needed to look up the version information."""
20 # these strings will be replaced by git during git-archive.
21 # setup.py/versioneer.py will grep for the variable names, so they must
22 # each be defined on a line of their own. _version.py will just call
23 # get_keywords().
24 git_refnames = " (tag: 0.6.0)"
25 git_full = "8b59dc8c2c035db5793102b9513c46e6a1bd4fb0"
26 git_date = "2019-11-13 10:37:40 -0600"
27 keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
28 return keywords
29
30
31 class VersioneerConfig:
32 """Container for Versioneer configuration parameters."""
33
34
35 def get_config():
36 """Create, populate and return the VersioneerConfig() object."""
37 # these strings are filled in when 'setup.py versioneer' creates
38 # _version.py
39 cfg = VersioneerConfig()
40 cfg.VCS = "git"
41 cfg.style = "pep440"
42 cfg.tag_prefix = ""
43 cfg.parentdir_prefix = "None"
44 cfg.versionfile_source = "fsspec/_version.py"
45 cfg.verbose = False
46 return cfg
47
48
49 class NotThisMethod(Exception):
50 """Exception raised if a method is not valid for the current scenario."""
51
52
53 LONG_VERSION_PY = {}
54 HANDLERS = {}
55
56
57 def register_vcs_handler(vcs, method): # decorator
58 """Decorator to mark a method as the handler for a particular VCS."""
59
60 def decorate(f):
61 """Store f in HANDLERS[vcs][method]."""
62 if vcs not in HANDLERS:
63 HANDLERS[vcs] = {}
64 HANDLERS[vcs][method] = f
65 return f
66
67 return decorate
68
69
70 def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
71 """Call the given command(s)."""
72 assert isinstance(commands, list)
73 p = None
74 for c in commands:
75 try:
76 dispcmd = str([c] + args)
77 # remember shell=False, so use git.cmd on windows, not just git
78 p = subprocess.Popen(
79 [c] + args,
80 cwd=cwd,
81 env=env,
82 stdout=subprocess.PIPE,
83 stderr=(subprocess.PIPE if hide_stderr else None),
84 )
85 break
86 except EnvironmentError:
87 e = sys.exc_info()[1]
88 if e.errno == errno.ENOENT:
89 continue
90 if verbose:
91 print("unable to run %s" % dispcmd)
92 print(e)
93 return None, None
94 else:
95 if verbose:
96 print("unable to find command, tried %s" % (commands,))
97 return None, None
98 stdout = p.communicate()[0].strip()
99 if sys.version_info[0] >= 3:
100 stdout = stdout.decode()
101 if p.returncode != 0:
102 if verbose:
103 print("unable to run %s (error)" % dispcmd)
104 print("stdout was %s" % stdout)
105 return None, p.returncode
106 return stdout, p.returncode
107
108
109 def versions_from_parentdir(parentdir_prefix, root, verbose):
110 """Try to determine the version from the parent directory name.
111
112 Source tarballs conventionally unpack into a directory that includes both
113 the project name and a version string. We will also support searching up
114 two directory levels for an appropriately named parent directory
115 """
116 rootdirs = []
117
118 for i in range(3):
119 dirname = os.path.basename(root)
120 if dirname.startswith(parentdir_prefix):
121 return {
122 "version": dirname[len(parentdir_prefix) :],
123 "full-revisionid": None,
124 "dirty": False,
125 "error": None,
126 "date": None,
127 }
128 else:
129 rootdirs.append(root)
130 root = os.path.dirname(root) # up a level
131
132 if verbose:
133 print(
134 "Tried directories %s but none started with prefix %s"
135 % (str(rootdirs), parentdir_prefix)
136 )
137 raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
138
139
140 @register_vcs_handler("git", "get_keywords")
141 def git_get_keywords(versionfile_abs):
142 """Extract version information from the given file."""
143 # the code embedded in _version.py can just fetch the value of these
144 # keywords. When used from setup.py, we don't want to import _version.py,
145 # so we do it with a regexp instead. This function is not used from
146 # _version.py.
147 keywords = {}
148 try:
149 f = open(versionfile_abs, "r")
150 for line in f.readlines():
151 if line.strip().startswith("git_refnames ="):
152 mo = re.search(r'=\s*"(.*)"', line)
153 if mo:
154 keywords["refnames"] = mo.group(1)
155 if line.strip().startswith("git_full ="):
156 mo = re.search(r'=\s*"(.*)"', line)
157 if mo:
158 keywords["full"] = mo.group(1)
159 if line.strip().startswith("git_date ="):
160 mo = re.search(r'=\s*"(.*)"', line)
161 if mo:
162 keywords["date"] = mo.group(1)
163 f.close()
164 except EnvironmentError:
165 pass
166 return keywords
167
168
169 @register_vcs_handler("git", "keywords")
170 def git_versions_from_keywords(keywords, tag_prefix, verbose):
171 """Get version information from git keywords."""
172 if not keywords:
173 raise NotThisMethod("no keywords at all, weird")
174 date = keywords.get("date")
175 if date is not None:
176 # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
177 # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
178 # -like" string, which we must then edit to make compliant), because
179 # it's been around since git-1.5.3, and it's too difficult to
180 # discover which version we're using, or to work around using an
181 # older one.
182 date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
183 refnames = keywords["refnames"].strip()
184 if refnames.startswith("$Format"):
185 if verbose:
186 print("keywords are unexpanded, not using")
187 raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
188 refs = set([r.strip() for r in refnames.strip("()").split(",")])
189 # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
190 # just "foo-1.0". If we see a "tag: " prefix, prefer those.
191 TAG = "tag: "
192 tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
193 if not tags:
194 # Either we're using git < 1.8.3, or there really are no tags. We use
195 # a heuristic: assume all version tags have a digit. The old git %d
196 # expansion behaves like git log --decorate=short and strips out the
197 # refs/heads/ and refs/tags/ prefixes that would let us distinguish
198 # between branches and tags. By ignoring refnames without digits, we
199 # filter out many common branch names like "release" and
200 # "stabilization", as well as "HEAD" and "master".
201 tags = set([r for r in refs if re.search(r"\d", r)])
202 if verbose:
203 print("discarding '%s', no digits" % ",".join(refs - tags))
204 if verbose:
205 print("likely tags: %s" % ",".join(sorted(tags)))
206 for ref in sorted(tags):
207 # sorting will prefer e.g. "2.0" over "2.0rc1"
208 if ref.startswith(tag_prefix):
209 r = ref[len(tag_prefix) :]
210 if verbose:
211 print("picking %s" % r)
212 return {
213 "version": r,
214 "full-revisionid": keywords["full"].strip(),
215 "dirty": False,
216 "error": None,
217 "date": date,
218 }
219 # no suitable tags, so version is "0+unknown", but full hex is still there
220 if verbose:
221 print("no suitable tags, using unknown + full revision id")
222 return {
223 "version": "0+unknown",
224 "full-revisionid": keywords["full"].strip(),
225 "dirty": False,
226 "error": "no suitable tags",
227 "date": None,
228 }
229
230
231 @register_vcs_handler("git", "pieces_from_vcs")
232 def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
233 """Get version from 'git describe' in the root of the source tree.
234
235 This only gets called if the git-archive 'subst' keywords were *not*
236 expanded, and _version.py hasn't already been rewritten with a short
237 version string, meaning we're inside a checked out source tree.
238 """
239 GITS = ["git"]
240 if sys.platform == "win32":
241 GITS = ["git.cmd", "git.exe"]
242
243 out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
244 if rc != 0:
245 if verbose:
246 print("Directory %s not under git control" % root)
247 raise NotThisMethod("'git rev-parse --git-dir' returned error")
248
249 # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
250 # if there isn't one, this yields HEX[-dirty] (no NUM)
251 describe_out, rc = run_command(
252 GITS,
253 [
254 "describe",
255 "--tags",
256 "--dirty",
257 "--always",
258 "--long",
259 "--match",
260 "%s*" % tag_prefix,
261 ],
262 cwd=root,
263 )
264 # --long was added in git-1.5.5
265 if describe_out is None:
266 raise NotThisMethod("'git describe' failed")
267 describe_out = describe_out.strip()
268 full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
269 if full_out is None:
270 raise NotThisMethod("'git rev-parse' failed")
271 full_out = full_out.strip()
272
273 pieces = {}
274 pieces["long"] = full_out
275 pieces["short"] = full_out[:7] # maybe improved later
276 pieces["error"] = None
277
278 # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
279 # TAG might have hyphens.
280 git_describe = describe_out
281
282 # look for -dirty suffix
283 dirty = git_describe.endswith("-dirty")
284 pieces["dirty"] = dirty
285 if dirty:
286 git_describe = git_describe[: git_describe.rindex("-dirty")]
287
288 # now we have TAG-NUM-gHEX or HEX
289
290 if "-" in git_describe:
291 # TAG-NUM-gHEX
292 mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
293 if not mo:
294 # unparseable. Maybe git-describe is misbehaving?
295 pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
296 return pieces
297
298 # tag
299 full_tag = mo.group(1)
300 if not full_tag.startswith(tag_prefix):
301 if verbose:
302 fmt = "tag '%s' doesn't start with prefix '%s'"
303 print(fmt % (full_tag, tag_prefix))
304 pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
305 full_tag,
306 tag_prefix,
307 )
308 return pieces
309 pieces["closest-tag"] = full_tag[len(tag_prefix) :]
310
311 # distance: number of commits since tag
312 pieces["distance"] = int(mo.group(2))
313
314 # commit: short hex revision ID
315 pieces["short"] = mo.group(3)
316
317 else:
318 # HEX: no tags
319 pieces["closest-tag"] = None
320 count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
321 pieces["distance"] = int(count_out) # total number of commits
322
323 # commit date: see ISO-8601 comment in git_versions_from_keywords()
324 date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[
325 0
326 ].strip()
327 pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
328
329 return pieces
330
331
332 def plus_or_dot(pieces):
333 """Return a + if we don't already have one, else return a ."""
334 if "+" in pieces.get("closest-tag", ""):
335 return "."
336 return "+"
337
338
339 def render_pep440(pieces):
340 """Build up version string, with post-release "local version identifier".
341
342 Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
343 get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
344
345 Exceptions:
346 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
347 """
348 if pieces["closest-tag"]:
349 rendered = pieces["closest-tag"]
350 if pieces["distance"] or pieces["dirty"]:
351 rendered += plus_or_dot(pieces)
352 rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
353 if pieces["dirty"]:
354 rendered += ".dirty"
355 else:
356 # exception #1
357 rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
358 if pieces["dirty"]:
359 rendered += ".dirty"
360 return rendered
361
362
363 def render_pep440_pre(pieces):
364 """TAG[.post.devDISTANCE] -- No -dirty.
365
366 Exceptions:
367 1: no tags. 0.post.devDISTANCE
368 """
369 if pieces["closest-tag"]:
370 rendered = pieces["closest-tag"]
371 if pieces["distance"]:
372 rendered += ".post.dev%d" % pieces["distance"]
373 else:
374 # exception #1
375 rendered = "0.post.dev%d" % pieces["distance"]
376 return rendered
377
378
379 def render_pep440_post(pieces):
380 """TAG[.postDISTANCE[.dev0]+gHEX] .
381
382 The ".dev0" means dirty. Note that .dev0 sorts backwards
383 (a dirty tree will appear "older" than the corresponding clean one),
384 but you shouldn't be releasing software with -dirty anyways.
385
386 Exceptions:
387 1: no tags. 0.postDISTANCE[.dev0]
388 """
389 if pieces["closest-tag"]:
390 rendered = pieces["closest-tag"]
391 if pieces["distance"] or pieces["dirty"]:
392 rendered += ".post%d" % pieces["distance"]
393 if pieces["dirty"]:
394 rendered += ".dev0"
395 rendered += plus_or_dot(pieces)
396 rendered += "g%s" % pieces["short"]
397 else:
398 # exception #1
399 rendered = "0.post%d" % pieces["distance"]
400 if pieces["dirty"]:
401 rendered += ".dev0"
402 rendered += "+g%s" % pieces["short"]
403 return rendered
404
405
406 def render_pep440_old(pieces):
407 """TAG[.postDISTANCE[.dev0]] .
408
409 The ".dev0" means dirty.
410
411 Exceptions:
412 1: no tags. 0.postDISTANCE[.dev0]
413 """
414 if pieces["closest-tag"]:
415 rendered = pieces["closest-tag"]
416 if pieces["distance"] or pieces["dirty"]:
417 rendered += ".post%d" % pieces["distance"]
418 if pieces["dirty"]:
419 rendered += ".dev0"
420 else:
421 # exception #1
422 rendered = "0.post%d" % pieces["distance"]
423 if pieces["dirty"]:
424 rendered += ".dev0"
425 return rendered
426
427
428 def render_git_describe(pieces):
429 """TAG[-DISTANCE-gHEX][-dirty].
430
431 Like 'git describe --tags --dirty --always'.
432
433 Exceptions:
434 1: no tags. HEX[-dirty] (note: no 'g' prefix)
435 """
436 if pieces["closest-tag"]:
437 rendered = pieces["closest-tag"]
438 if pieces["distance"]:
439 rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
440 else:
441 # exception #1
442 rendered = pieces["short"]
443 if pieces["dirty"]:
444 rendered += "-dirty"
445 return rendered
446
447
448 def render_git_describe_long(pieces):
449 """TAG-DISTANCE-gHEX[-dirty].
450
451 Like 'git describe --tags --dirty --always --long'.
452 The distance/hash is unconditional.
453
454 Exceptions:
455 1: no tags. HEX[-dirty] (note: no 'g' prefix)
456 """
457 if pieces["closest-tag"]:
458 rendered = pieces["closest-tag"]
459 rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
460 else:
461 # exception #1
462 rendered = pieces["short"]
463 if pieces["dirty"]:
464 rendered += "-dirty"
465 return rendered
466
467
468 def render(pieces, style):
469 """Render the given version pieces into the requested style."""
470 if pieces["error"]:
471 return {
472 "version": "unknown",
473 "full-revisionid": pieces.get("long"),
474 "dirty": None,
475 "error": pieces["error"],
476 "date": None,
477 }
478
479 if not style or style == "default":
480 style = "pep440" # the default
481
482 if style == "pep440":
483 rendered = render_pep440(pieces)
484 elif style == "pep440-pre":
485 rendered = render_pep440_pre(pieces)
486 elif style == "pep440-post":
487 rendered = render_pep440_post(pieces)
488 elif style == "pep440-old":
489 rendered = render_pep440_old(pieces)
490 elif style == "git-describe":
491 rendered = render_git_describe(pieces)
492 elif style == "git-describe-long":
493 rendered = render_git_describe_long(pieces)
494 else:
495 raise ValueError("unknown style '%s'" % style)
496
497 return {
498 "version": rendered,
499 "full-revisionid": pieces["long"],
500 "dirty": pieces["dirty"],
501 "error": None,
502 "date": pieces.get("date"),
503 }
504
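# Illustrative sketch: how a hypothetical `pieces` dict, as produced by
# git_pieces_from_vcs() above, renders under a few of the styles. The
# tag/hash/date values below are invented purely for demonstration.
#
#     pieces = {"closest-tag": "0.6.0", "distance": 3, "short": "abc1234",
#               "long": "abc1234def5678", "dirty": False,
#               "error": None, "date": "2019-10-01T12:00:00"}
#     render(pieces, "pep440")["version"]        # "0.6.0+3.gabc1234"
#     render(pieces, "pep440-post")["version"]   # "0.6.0.post3+gabc1234"
#     render(pieces, "git-describe")["version"]  # "0.6.0-3-gabc1234"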
505
506 def get_versions():
507 """Get version information or return default if unable to do so."""
508 # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
509 # __file__, we can work backwards from there to the root. Some
510 # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
511 # case we can only use expanded keywords.
512
513 cfg = get_config()
514 verbose = cfg.verbose
515
516 try:
517 return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose)
518 except NotThisMethod:
519 pass
520
521 try:
522 root = os.path.realpath(__file__)
523 # versionfile_source is the relative path from the top of the source
524 # tree (where the .git directory might live) to this file. Invert
525 # this to find the root from __file__.
526 for i in cfg.versionfile_source.split("/"):
527 root = os.path.dirname(root)
528 except NameError:
529 return {
530 "version": "0+unknown",
531 "full-revisionid": None,
532 "dirty": None,
533 "error": "unable to find root of source tree",
534 "date": None,
535 }
536
537 try:
538 pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
539 return render(pieces, cfg.style)
540 except NotThisMethod:
541 pass
542
543 try:
544 if cfg.parentdir_prefix:
545 return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
546 except NotThisMethod:
547 pass
548
549 return {
550 "version": "0+unknown",
551 "full-revisionid": None,
552 "dirty": None,
553 "error": "unable to compute version",
554 "date": None,
555 }
0 import os
1 import io
2 import functools
3 import logging
4 import math
5
6 logger = logging.getLogger("fsspec")
7
8
9 class BaseCache(object):
10 """Pass-though cache: doesn't keep anything, calls every time
11
12 Acts as base class for other cachers
13
14 Parameters
15 ----------
16 blocksize: int
17 How far to read ahead in numbers of bytes
18 fetcher: func
19 Function of the form f(start, end) which gets bytes from remote as
20 specified
21 size: int
22 How big this file is
23 """
24
25 def __init__(self, blocksize, fetcher, size):
26 self.blocksize = blocksize
27 self.fetcher = fetcher
28 self.size = size
29
30 def _fetch(self, start, end):
31 return self.fetcher(start, end)
32
33 def __getitem__(self, item: slice):
34 if not isinstance(item, slice):
35 raise TypeError(
36 "Cache indices must be a contiguous slice. Got {} instead.".format(
37 type(item)
38 )
39 )
40 if item.step and item.step != 1:
41 raise ValueError(
42 "Cache indices must be a contiguous slice. 'item' has step={}".format(
43 item.step
44 )
45 )
46
47 # handle endpoints
48 if item.start is None:
49 item = slice(0, item.stop)
50 elif item.start < 0:
51 item = slice(self.size + item.start, item.stop)
52 if item.stop is None:
53 item = slice(item.start, self.size)
54 elif item.stop < 0:
55 item = slice(item.start, self.size + item.stop)
56
57 return self._fetch(item.start, item.stop)
58
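# Minimal usage sketch: any callable of the form f(start, end) -> bytes can
# serve as the fetcher; here a local bytes object stands in for remote data.
#
#     data = bytes(range(256))
#     cache = BaseCache(blocksize=32, fetcher=lambda s, e: data[s:e], size=len(data))
#     cache[10:20]   # equals data[10:20]; every access calls the fetcher again
#     cache[-16:]    # negative indices are resolved against `size`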
59
60 class MMapCache(BaseCache):
61 """memory-mapped sparse file cache
62
63 Opens temporary file, which is filled blocks-wise when data is requested.
64 Ensure there is enough disc space in the temporary location.
65
66 This cache method might only work on posix
67 """
68
69 def __init__(self, blocksize, fetcher, size, location=None, blocks=None):
70 super().__init__(blocksize, fetcher, size)
71 self.blocks = set() if blocks is None else blocks
72 self.location = location
73 self.cache = self._makefile()
74
75 def _makefile(self):
76 import tempfile
77 import mmap
78
79 if self.size == 0:
80 return bytearray()
81
82 # posix version
83 if self.location is None or not os.path.exists(self.location):
84 if self.location is None:
85 fd = tempfile.TemporaryFile()
86 self.blocks = set()
87 else:
88 fd = io.open(self.location, "wb+")
89 fd.seek(self.size - 1)
90 fd.write(b"1")
91 fd.flush()
92 else:
93 fd = io.open(self.location, "rb+")
94
95 return mmap.mmap(fd.fileno(), self.size)
96
97 def _fetch(self, start, end):
98 start_block = start // self.blocksize
99 end_block = end // self.blocksize
100 need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
101 while need:
102 # TODO: not a for loop so we can consolidate blocks later to
103 # make fewer fetch calls; this could be parallel
104 i = need.pop(0)
105 sstart = i * self.blocksize
106 send = min(sstart + self.blocksize, self.size)
107 self.cache[sstart:send] = self.fetcher(sstart, send)
108 self.blocks.add(i)
109
110 return self.cache[start:end]
111
112 def __getstate__(self):
113 state = self.__dict__.copy()
114 # Remove the unpicklable entries.
115 del state["cache"]
116 return state
117
118 def __setstate__(self, state):
119 # Restore instance attributes
120 self.__dict__.update(state)
121 self.cache = self._makefile()
122
123
124 class ReadAheadCache(BaseCache):
125 """ Cache which reads only when we get beyond a block of data
126
127 This is a much simpler version of BytesCache, and does not attempt to
128 fill holes in the cache or keep fragments alive. It is best suited to
129 many small reads in a sequential order (e.g., reading lines from a file).
130 """
131
132 def __init__(self, blocksize, fetcher, size):
133 super().__init__(blocksize, fetcher, size)
134 self.cache = b""
135 self.start = 0
136 self.end = 0
137
138 def _fetch(self, start, end):
139 end = min(self.size, end)
140 l = end - start
141 if start >= self.size:
142 return b""
143 elif start >= self.start and end <= self.end:
144 # cache hit
145 return self.cache[start - self.start : end - self.start]
146 elif self.start <= start < self.end:
147 # partial hit
148 part = self.cache[start - self.start :]
149 l -= len(part)
150 start = self.end
151 else:
152 # miss
153 part = b""
154 end = min(self.size, end + self.blocksize)
155 self.cache = self.fetcher(start, end) # new block replaces old
156 self.start = start
157 self.end = self.start + len(self.cache)
158 return part + self.cache[:l]
159
160
161 class BlockCache(BaseCache):
162 """
163 Cache holding memory as a set of blocks.
164
165 Requests are only ever made `blocksize` at a time, and are
166 stored in an LRU cache. The least recently accessed block is
167 discarded when more than `maxblocks` are stored.
168
169 Parameters
170 ----------
171 blocksize : int
172 The number of bytes to store in each block.
173 Requests are only ever made for `blocksize`, so this
174 should balance the overhead of making a request against
175 the granularity of the blocks.
176 fetcher : Callable
177 size : int
178 The total size of the file being cached.
179 maxblocks : int
180 The maximum number of blocks to cache. The maximum memory
181 use for this cache is then ``blocksize * maxblocks``.
182 """
183
184 def __init__(self, blocksize, fetcher, size, maxblocks=32):
185 super().__init__(blocksize, fetcher, size)
186 self.nblocks = math.ceil(size / blocksize)
187 self.maxblocks = maxblocks
188 self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)
189
190 def __repr__(self):
191 return "<BlockCache blocksize={}, size={}, nblocks={}>".format(
192 self.blocksize, self.size, self.nblocks
193 )
194
195 def cache_info(self):
196 """
197 The statistics on the block cache.
198
199 Returns
200 -------
201 NamedTuple
202 Returned directly from the LRU Cache used internally.
203 """
204 return self._fetch_block_cached.cache_info()
205
206 def __getstate__(self):
207 state = self.__dict__.copy()
208 del state["_fetch_block_cached"]
209 return state
210
211 def __setstate__(self, state):
212 self.__dict__.update(state)
213 self._fetch_block_cached = functools.lru_cache(state["maxblocks"])(
214 self._fetch_block
215 )
216
217 def _fetch(self, start, end):
218 if end < start:
219 raise ValueError(
220 "'end' ({}) is smaller than 'start' ({}).".format(end, start)
221 )
222
223 if end > self.size:
224 raise ValueError("'end={}' larger than size ('{}')".format(end, self.size))
225
226 # byte position -> block numbers
227 start_block_number = start // self.blocksize
228 end_block_number = end // self.blocksize
229
230 # these are cached, so safe to do multiple calls for the same start and end.
231 for block_number in range(start_block_number, end_block_number + 1):
232 self._fetch_block_cached(block_number)
233
234 return self._read_cache(
235 start,
236 end,
237 start_block_number=start_block_number,
238 end_block_number=end_block_number,
239 )
240
241 def _fetch_block(self, block_number):
242 """
243 Fetch the block of data for `block_number`.
244 """
245 if block_number > self.nblocks:
246 raise ValueError(
247 "'block_number={}' is greater than the number of blocks ({})".format(
248 block_number, self.nblocks
249 )
250 )
251
252 start = block_number * self.blocksize
253 end = start + self.blocksize
254 logger.info("BlockCache fetching block %d", block_number)
255 block_contents = super()._fetch(start, end)
256 return block_contents
257
258 def _read_cache(self, start, end, start_block_number, end_block_number):
259 """
260 Read from our block cache.
261
262 Parameters
263 ----------
264 start, end : int
265 The start and end byte positions.
266 start_block_number, end_block_number : int
267 The start and end block numbers.
268 """
269 start_pos = start % self.blocksize
270 end_pos = end % self.blocksize
271
272 if start_block_number == end_block_number:
273 block = self._fetch_block_cached(start_block_number)
274 return block[start_pos:end_pos]
275
276 else:
277 # read from the initial
278 out = []
279 out.append(self._fetch_block_cached(start_block_number)[start_pos:])
280
281 # intermediate blocks
282 # Note: it'd be nice to combine these into one big request. However
283 # that doesn't play nicely with our LRU cache.
284 for block_number in range(start_block_number + 1, end_block_number):
285 out.append(self._fetch_block_cached(block_number))
286
287 # final block
288 out.append(self._fetch_block_cached(end_block_number)[:end_pos])
289
290 return b"".join(out)
291
292
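# Usage sketch for BlockCache (an in-memory source stands in for a remote
# file here): requests are rounded to whole blocks and served via the LRU.
#
#     data = b"x" * 10000
#     bc = BlockCache(blocksize=1024, fetcher=lambda s, e: data[s:e],
#                     size=len(data), maxblocks=4)
#     bc[0:10]         # fetches block 0, returns the first ten bytes
#     bc[0:10]         # same block served from the LRU, no new fetch
#     bc.cache_info()  # hit/miss counters from functools.lru_cache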
293 class BytesCache(BaseCache):
294 """Cache which holds data in a in-memory bytes object
295
296 Implements read-ahead by the block size, for semi-random reads progressing
297 through the file.
298
299 Parameters
300 ----------
301 trim: bool
302 As we read more data, whether to discard the start of the buffer when
303 we are more than a blocksize ahead of it.
304 """
305
306 def __init__(self, blocksize, fetcher, size, trim=True):
307 super().__init__(blocksize, fetcher, size)
308 self.cache = b""
309 self.start = None
310 self.end = None
311 self.trim = trim
312
313 def _fetch(self, start, end):
314 # TODO: only set start/end after fetch, in case it fails?
315 # is this where retry logic might go?
316 if (
317 self.start is not None
318 and start >= self.start
319 and self.end is not None
320 and end < self.end
321 ):
322 # cache hit: we have all the required data
323 offset = start - self.start
324 return self.cache[offset : offset + end - start]
325
326 if self.blocksize:
327 bend = min(self.size, end + self.blocksize)
328 else:
329 bend = end
330
331 if bend == start or start > self.size:
332 return b""
333
334 if (self.start is None or start < self.start) and (
335 self.end is None or end > self.end
336 ):
337 # First read, or extending both before and after
338 self.cache = self.fetcher(start, bend)
339 self.start = start
340 elif start < self.start:
341 if self.end - end > self.blocksize:
342 self.cache = self.fetcher(start, bend)
343 self.start = start
344 else:
345 new = self.fetcher(start, self.start)
346 self.start = start
347 self.cache = new + self.cache
348 elif bend > self.end:
349 if self.end > self.size:
350 pass
351 elif end - self.end > self.blocksize:
352 self.cache = self.fetcher(start, bend)
353 self.start = start
354 else:
355 new = self.fetcher(self.end, bend)
356 self.cache = self.cache + new
357
358 self.end = self.start + len(self.cache)
359 offset = start - self.start
360 out = self.cache[offset : offset + end - start]
361 if self.trim:
362 num = (self.end - self.start) // (self.blocksize + 1)
363 if num > 1:
364 self.start += self.blocksize * num
365 self.cache = self.cache[self.blocksize * num :]
366 return out
367
368 def __len__(self):
369 return len(self.cache)
370
371
372 caches = {
373 "none": BaseCache,
374 "mmap": MMapCache,
375 "bytes": BytesCache,
376 "readahead": ReadAheadCache,
377 "block": BlockCache,
378 }
0 """Helper functions for a standard streaming compression API"""
1 from bz2 import BZ2File
2 from gzip import GzipFile
3 from zipfile import ZipFile
4
5 import fsspec.utils
6 from fsspec.spec import AbstractBufferedFile
7
8
9 def noop_file(file, mode, **kwargs):
10 return file
11
12
13 # should be functions of the form func(infile, mode=, **kwargs) -> file-like
14 compr = {None: noop_file}
15
16
17 def register_compression(name, callback, extensions, force=False):
18 """Register an "inferable" file compression type.
19
20 Registers transparent file compression type for use with fsspec.open.
21 Compression can be specified by name in open, or "infer"-ed for any files
22 ending with the given extensions.
23
24 Args:
25 name: (str) The compression type name. Eg. "gzip".
26 callback: A callable of form (infile, mode, **kwargs) -> file-like.
27 Accepts an input file-like object, the target mode and kwargs.
28 Returns a wrapped file-like object.
29 extensions: (str, Iterable[str]) A file extension, or list of file
30 extensions for which to infer this compression scheme. Eg. "gz".
31 force: (bool) Force re-registration of compression type or extensions.
32
33 Raises:
34 ValueError: If name or extensions already registered, and not force.
35
36 """
37 if isinstance(extensions, str):
38 extensions = [extensions]
39
40 # Validate registration
41 if name in compr and not force:
42 raise ValueError("Duplicate compression registration: %s" % name)
43
44 for ext in extensions:
45 if ext in fsspec.utils.compressions and not force:
46 raise ValueError(
47 "Duplicate compression file extension: %s (%s)" % (ext, name)
48 )
49
50 compr[name] = callback
51
52 for ext in extensions:
53 fsspec.utils.compressions[ext] = name
54
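# Hedged sketch: registering a pass-through codec for a made-up ".raw"
# extension (the name and extension here are purely illustrative).
#
#     register_compression("identity", lambda f, mode="rb", **kw: f, "raw")
#     # fsspec.open("file.raw", compression="infer") would now resolve to it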
55
56 def unzip(infile, mode="rb", filename=None, **kwargs):
57 if "r" not in mode:
58 filename = filename or "file"
59 z = ZipFile(infile, mode="w", **kwargs)
60 fo = z.open(filename, mode="w")
61 fo.close = lambda closer=fo.close: closer() or z.close()
62 return fo
63 z = ZipFile(infile)
64 if filename is None:
65 filename = z.namelist()[0]
66 return z.open(filename, mode="r", **kwargs)
67
68
69 register_compression("zip", unzip, "zip")
70 register_compression("bz2", BZ2File, "bz2")
71 register_compression("gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz")
72
73 try:
74 import lzma
75
76 register_compression("lzma", lzma.LZMAFile, "xz")
77 register_compression("xz", lzma.LZMAFile, "xz", force=True)
78 except ImportError:
79 pass
80
81 try:
82 import lzmaffi
83
84 register_compression("lzma", lzmaffi.LZMAFile, "xz", force=True)
85 register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
86 except ImportError:
87 pass
88
89
90 class SnappyFile(AbstractBufferedFile):
91 def __init__(self, infile, mode, **kwargs):
92 import snappy
93
94 self.details = {"size": 999999999} # not true, but OK if we don't seek
95 super().__init__(fs=None, path="snappy", mode=mode.strip("b") + "b", **kwargs)
96 self.infile = infile
97 if "r" in mode:
98 self.codec = snappy.StreamDecompressor()
99 else:
100 self.codec = snappy.StreamCompressor()
101
102 def _upload_chunk(self, final=False):
103 self.buffer.seek(0)
104 out = self.codec.add_chunk(self.buffer.read())
105 self.infile.write(out)
106 return True
107
108 def seek(self, loc, whence=0):
109 raise NotImplementedError("SnappyFile is not seekable")
110
111 def seekable(self):
112 return False
113
114 def _fetch_range(self, start, end):
115 """Get the specified set of bytes from remote"""
116 data = self.infile.read(end - start)
117 return self.codec.decompress(data)
118
119
120 try:
121 import snappy
122
123 snappy.compress
124 # Snappy may use the .sz file extension, but this is not part of the
125 # standard implementation.
126 register_compression("snappy", SnappyFile, [])
127
128 except (ImportError, NameError):
129 pass
130
131 try:
132 import lz4.frame
133
134 register_compression("lz4", lz4.frame.open, "lz4")
135 except ImportError:
136 pass
137
138 try:
139 import zstandard as zstd
140
141 def zstandard_file(infile, mode="rb"):
142 if "r" in mode:
143 cctx = zstd.ZstdDecompressor()
144 return cctx.stream_reader(infile)
145 else:
146 cctx = zstd.ZstdCompressor(level=10)
147 return cctx.stream_writer(infile)
148
149 register_compression("zstd", zstandard_file, "zst")
150 except ImportError:
151 pass
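# Usage sketch (assumes a gzip-compressed "data.csv.gz" is reachable): the
# registry above lets fsspec.open pick the codec from the file extension.
#
#     import fsspec
#     with fsspec.open("data.csv.gz", "rt", compression="infer") as f:
#         header = f.readline()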
0 import os
1 import shutil
2 import subprocess
3 import sys
4 import time
5
6 import pytest
7
8 import fsspec
9 from fsspec.implementations.cached import CachingFileSystem
10
11
12 @pytest.fixture()
13 def m():
14 """
15 Fixture providing a memory filesystem.
16 """
17 m = fsspec.filesystem("memory")
18 m.store.clear()
19 try:
20 yield m
21 finally:
22 m.store.clear()
23
24
25 @pytest.fixture
26 def ftp_writable(tmpdir):
27 """
28 Fixture providing a writable FTP filesystem.
29 """
30 pytest.importorskip("pyftpdlib")
31 from fsspec.implementations.ftp import FTPFileSystem
32
33 FTPFileSystem.clear_instance_cache() # remove lingering connections
34 CachingFileSystem.clear_instance_cache()
35 d = str(tmpdir)
36 with open(os.path.join(d, "out"), "wb") as f:
37 f.write(b"hello" * 10000)
38 P = subprocess.Popen(
39 [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"]
40 )
41 try:
42 time.sleep(1)
43 yield "localhost", 2121, "user", "pass"
44 finally:
45 P.terminate()
46 P.wait()
47 try:
48 shutil.rmtree(tmpdir)
49 except Exception:
50 pass
0 from __future__ import print_function, division, absolute_import
1
2 import io
3 import os
4 import logging
5 from .compression import compr
6 from .utils import (
7 infer_compression,
8 build_name_function,
9 update_storage_options,
10 stringify_path,
11 )
12 from .registry import get_filesystem_class
13
14 # for backwards compat, we export cache things from here too
15 from .caching import ( # noqa: F401
16 BaseCache,
17 MMapCache,
18 ReadAheadCache,
19 BytesCache,
20 BlockCache,
21 caches,
22 )
23
24 logger = logging.getLogger("fsspec")
25
26
27 class OpenFile(object):
28 """
29 File-like object to be used in a context
30
31 Can layer (buffered) text-mode and compression over any file-system, which
32 is typically binary-only.
33
34 These instances are safe to serialize, as the low-level file object
35 is not created until invoked using `with`.
36
37 Parameters
38 ----------
39 fs: FileSystem
40 The file system to use for opening the file. Should match the interface
41 of ``dask.bytes.local.LocalFileSystem``.
42 path: str
43 Location to open
44 mode: str like 'rb', optional
45 Mode of the opened file
46 compression: str or None, optional
47 Compression to apply
48 encoding: str or None, optional
49 The encoding to use if opened in text mode.
50 errors: str or None, optional
51 How to handle encoding errors if opened in text mode.
52 newline: None or str
53 Passed to TextIOWrapper in text mode, how to handle line endings.
54 """
55
56 def __init__(
57 self,
58 fs,
59 path,
60 mode="rb",
61 compression=None,
62 encoding=None,
63 errors=None,
64 newline=None,
65 ):
66 self.fs = fs
67 self.path = path
68 self.mode = mode
69 self.compression = get_compression(path, compression)
70 self.encoding = encoding
71 self.errors = errors
72 self.newline = newline
73 self.fobjects = []
74
75 def __reduce__(self):
76 return (
77 OpenFile,
78 (
79 self.fs,
80 self.path,
81 self.mode,
82 self.compression,
83 self.encoding,
84 self.errors,
85 ),
86 )
87
88 def __repr__(self):
89 return "<OpenFile '{}'>".format(self.path)
90
91 def __fspath__(self):
92 return self.path
93
94 def __enter__(self):
95 mode = self.mode.replace("t", "").replace("b", "") + "b"
96
97 f = self.fs.open(self.path, mode=mode)
98
99 self.fobjects = [f]
100
101 if self.compression is not None:
102 compress = compr[self.compression]
103 f = compress(f, mode=mode[0])
104 self.fobjects.append(f)
105
106 if "b" not in self.mode:
107 # assume, for example, that 'r' is equivalent to 'rt' as in builtin
108 f = io.TextIOWrapper(
109 f, encoding=self.encoding, errors=self.errors, newline=self.newline
110 )
111 self.fobjects.append(f)
112
113 return self.fobjects[-1]
114
115 def __exit__(self, *args):
116 self.close()
117
118 def __del__(self):
119 self.close()
120
121 def open(self):
122 """Materialise this as a real open file without context
123
124 The file should be explicitly closed to avoid enclosed open file
125 instances persisting
126 """
127 return self.__enter__()
128
129 def close(self):
130 """Close all encapsulated file objects"""
131 for f in reversed(self.fobjects):
132 if "r" not in self.mode and not f.closed:
133 f.flush()
134 f.close()
135 self.fobjects = []
136
137
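# Minimal sketch of using OpenFile directly with the local filesystem
# (assumes "example.txt" exists on disk); normally instances come from
# open_files()/open() below rather than being constructed by hand.
#
#     from fsspec.implementations.local import LocalFileSystem
#     of = OpenFile(LocalFileSystem(), "example.txt", mode="rt")
#     with of as f:
#         text = f.read()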
138 def open_files(
139 urlpath,
140 mode="rb",
141 compression=None,
142 encoding="utf8",
143 errors=None,
144 name_function=None,
145 num=1,
146 protocol=None,
147 newline=None,
148 **kwargs
149 ):
150 """ Given a path or paths, return a list of ``OpenFile`` objects.
151
152 For writing, a str path must contain the "*" character, which will be filled
153 in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2.
154
155 For either reading or writing, can instead provide explicit list of paths.
156
157 Parameters
158 ----------
159 urlpath: string or list
160 Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
161 to read from alternative filesystems. To read from multiple files you
162 can pass a globstring or a list of paths, with the caveat that they
163 must all have the same protocol.
164 mode: 'rb', 'wt', etc.
165 compression: string
166 Compression to use. See ``fsspec.compression.compr`` for options.
167 encoding: str
168 For text mode only
169 errors: None or str
170 Passed to TextIOWrapper in text mode
171 name_function: function or None
172 if opening a set of files for writing, those files do not yet exist,
173 so we need to generate their names by formatting the urlpath for
174 each sequence number
175 num: int [1]
176 if writing mode, number of files we expect to create (passed to
177 name_function)
178 protocol: str or None
179 If given, overrides the protocol found in the URL.
180 newline: str or None
181 Used for line terminator in text mode. If None, uses system default;
182 if blank, uses no translation.
183 **kwargs: dict
184 Extra options that make sense to a particular storage connection, e.g.
185 host, port, username, password, etc.
186
187 Examples
188 --------
189 >>> files = open_files('2015-*-*.csv') # doctest: +SKIP
190 >>> files = open_files(
191 ... 's3://bucket/2015-*-*.csv.gz', compression='gzip'
192 ... ) # doctest: +SKIP
193
194 Returns
195 -------
196 List of ``OpenFile`` objects.
197 """
198 fs, fs_token, paths = get_fs_token_paths(
199 urlpath,
200 mode,
201 num=num,
202 name_function=name_function,
203 storage_options=kwargs,
204 protocol=protocol,
205 )
206 return [
207 OpenFile(
208 fs,
209 path,
210 mode=mode,
211 compression=compression,
212 encoding=encoding,
213 errors=errors,
214 newline=newline,
215 )
216 for path in paths
217 ]
218
219
220 def open(
221 urlpath,
222 mode="rb",
223 compression=None,
224 encoding="utf8",
225 errors=None,
226 protocol=None,
227 newline=None,
228 **kwargs
229 ):
230 """ Given a path or paths, return one ``OpenFile`` object.
231
232 Parameters
233 ----------
234 urlpath: string or list
235 Absolute or relative filepath. Prefix with a protocol like ``s3://``
236 to read from alternative filesystems. Should not include glob
237 character(s).
238 mode: 'rb', 'wt', etc.
239 compression: string
240 Compression to use. See ``fsspec.compression.compr`` for options.
241 encoding: str
242 For text mode only
243 errors: None or str
244 Passed to TextIOWrapper in text mode
245 protocol: str or None
246 If given, overrides the protocol found in the URL.
247 newline: str or None
248 Used for line terminator in text mode. If None, uses system default;
249 if blank, uses no translation.
250 **kwargs: dict
251 Extra options that make sense to a particular storage connection, e.g.
252 host, port, username, password, etc.
253
254 Examples
255 --------
256 >>> openfile = open('2015-01-01.csv') # doctest: +SKIP
257 >>> openfile = open(
258 ... 's3://bucket/2015-01-01.csv.gz',
259 ... compression='gzip'
260 ... ) # doctest: +SKIP
261 >>> with openfile as f:
262 ... df = pd.read_csv(f) # doctest: +SKIP
263
264 Returns
265 -------
266 ``OpenFile`` object.
267 """
268 return open_files(
269 [urlpath],
270 mode,
271 compression,
272 encoding,
273 errors,
274 protocol,
275 newline=newline,
276 **kwargs
277 )[0]
278
279
280 def get_compression(urlpath, compression):
281 if compression == "infer":
282 compression = infer_compression(urlpath)
283 if compression is not None and compression not in compr:
284 raise ValueError("Compression type %s not supported" % compression)
285 return compression
286
287
288 def split_protocol(urlpath):
289 """Return protocol, path pair"""
290 urlpath = stringify_path(urlpath)
291 if "://" in urlpath:
292 protocol, path = urlpath.split("://", 1)
293 if len(protocol) > 1:
294 # excludes Windows paths
295 return protocol, path
296 return None, urlpath
297
298
299 def strip_protocol(urlpath):
300 """Return only path part of full URL, according to appropriate backend"""
301 protocol, _ = split_protocol(urlpath)
302 cls = get_filesystem_class(protocol)
303 return cls._strip_protocol(urlpath)
304
305
306 def expand_paths_if_needed(paths, mode, num, fs, name_function):
307 """Expand paths if they have a ``*`` in them.
308
309 paths: list of paths
310 mode: str
311 Mode in which to open files.
312 num: int
313 If opening in writing mode, number of files we expect to create.
314 fs: filesystem object
315 name_function: callable
316 If opening in writing mode, this callable is used to generate path
317 names. Names are generated for each partition by
318 ``urlpath.replace('*', name_function(partition_index))``.
319 Returns a list of expanded paths.
320 """
321 expanded_paths = []
322 paths = list(paths)
323 if "w" in mode and sum([1 for p in paths if "*" in p]) > 1:
324 raise ValueError("When writing data, only one filename mask can be specified.")
325 elif "w" in mode:
326 num = max(num, len(paths))
327 for curr_path in paths:
328 if "*" in curr_path:
329 if "w" in mode:
330 # expand using name_function
331 expanded_paths.extend(_expand_paths(curr_path, name_function, num))
332 else:
333 # expand using glob
334 expanded_paths.extend(fs.glob(curr_path))
335 else:
336 expanded_paths.append(curr_path)
337 # if we generated more paths than asked for, trim the list
338 if "w" in mode and len(expanded_paths) > num:
339 expanded_paths = expanded_paths[:num]
340 return expanded_paths
341
342
343 def get_fs_token_paths(
344 urlpath, mode="rb", num=1, name_function=None, storage_options=None, protocol=None
345 ):
346 """Filesystem, deterministic token, and paths from a urlpath and options.
347
348 Parameters
349 ----------
350 urlpath: string or iterable
351 Absolute or relative filepath, URL (may include protocols like
352 ``s3://``), or globstring pointing to data.
353 mode: str, optional
354 Mode in which to open files.
355 num: int, optional
356 If opening in writing mode, number of files we expect to create.
357 name_function: callable, optional
358 If opening in writing mode, this callable is used to generate path
359 names. Names are generated for each partition by
360 ``urlpath.replace('*', name_function(partition_index))``.
361 storage_options: dict, optional
362 Additional keywords to pass to the filesystem class.
363 protocol: str or None
364 To override the protocol specifier in the URL
365 """
366 if isinstance(urlpath, (list, tuple)):
367 if not urlpath:
368 raise ValueError("empty urlpath sequence")
369 protocols, paths = zip(*map(split_protocol, urlpath))
370 protocol = protocol or protocols[0]
371 if not all(p == protocol for p in protocols):
372 raise ValueError(
373 "When specifying a list of paths, all paths must "
374 "share the same protocol"
375 )
376 cls = get_filesystem_class(protocol)
377 optionss = list(map(cls._get_kwargs_from_urls, urlpath))
378 paths = [cls._strip_protocol(u) for u in urlpath]
379 options = optionss[0]
380 if not all(o == options for o in optionss):
381 raise ValueError(
382 "When specifying a list of paths, all paths must "
383 "share the same file-system options"
384 )
385 update_storage_options(options, storage_options)
386 fs = cls(**options)
387 paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
388
389 elif isinstance(urlpath, str) or hasattr(urlpath, "name"):
390 protocols, path = split_protocol(urlpath)
391 protocol = protocol or protocols
392 cls = get_filesystem_class(protocol)
393
394 options = cls._get_kwargs_from_urls(urlpath)
395 path = cls._strip_protocol(urlpath)
396 update_storage_options(options, storage_options)
397 fs = cls(**options)
398
399 if "w" in mode:
400 paths = _expand_paths(path, name_function, num)
401 elif "*" in path:
402 paths = sorted(fs.glob(path))
403 else:
404 paths = [path]
405
406 else:
407 raise TypeError("url type not understood: %s" % urlpath)
408
409 return fs, fs._fs_token, paths
410
411
412 def _expand_paths(path, name_function, num):
413 if isinstance(path, str):
414 if path.count("*") > 1:
415 raise ValueError("Output path spec must contain exactly one '*'.")
416 elif "*" not in path:
417 path = os.path.join(path, "*.part")
418
419 if name_function is None:
420 name_function = build_name_function(num - 1)
421
422 paths = [path.replace("*", name_function(i)) for i in range(num)]
423 if paths != sorted(paths):
424 logger.warning(
425 "In order to preserve order between partitions"
426 " paths created with ``name_function`` should "
427 "sort to partition order"
428 )
429 elif isinstance(path, (tuple, list)):
430 assert len(path) == num
431 paths = list(path)
432 else:
433 raise ValueError(
434 "Path should be either\n"
435 "1. A list of paths: ['foo.json', 'bar.json', ...]\n"
436 "2. A directory: 'foo/\n"
437 "3. A path with a '*' in it: 'foo.*.json'"
438 )
439 return paths
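# Illustrative sketch of get_fs_token_paths with a write-mode globstring on
# the local filesystem (nothing is created until the files are opened):
#
#     fs, token, paths = get_fs_token_paths("out/part-*.csv", mode="wb", num=3)
#     # paths -> three concrete names with "*" replaced by 0, 1 and 2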
0 from __future__ import print_function
1 import os
2 import stat
3 from errno import ENOENT, EIO
4 from fuse import Operations, FuseOSError
5 import threading
6 import time
7 from fuse import FUSE
8
9
10 class FUSEr(Operations):
11 def __init__(self, fs, path):
12 self.fs = fs
13 self.cache = {}
14 self.root = path.rstrip("/") + "/"
15 self.counter = 0
16
17 def getattr(self, path, fh=None):
18 path = "".join([self.root, path.lstrip("/")]).rstrip("/")
19 try:
20 info = self.fs.info(path)
21 except FileNotFoundError:
22 raise FuseOSError(ENOENT)
23 data = {"st_uid": 1000, "st_gid": 1000}
24 perm = 0o777
25
26 if info["type"] != "file":
27 data["st_mode"] = stat.S_IFDIR | perm
28 data["st_size"] = 0
29 data["st_blksize"] = 0
30 else:
31 data["st_mode"] = stat.S_IFREG | perm
32 data["st_size"] = info["size"]
33 data["st_blksize"] = 5 * 2 ** 20
34 data["st_nlink"] = 1
35 data["st_atime"] = time.time()
36 data["st_ctime"] = time.time()
37 data["st_mtime"] = time.time()
38 return data
39
40 def readdir(self, path, fh):
41 path = "".join([self.root, path.lstrip("/")])
42 files = self.fs.ls(path, False)
43 files = [os.path.basename(f.rstrip("/")) for f in files]
44 return [".", ".."] + files
45
46 def mkdir(self, path, mode):
47 path = "".join([self.root, path.lstrip("/")])
48 self.fs.mkdir(path)
49 return 0
50
51 def rmdir(self, path):
52 path = "".join([self.root, path.lstrip("/")])
53 self.fs.rmdir(path)
54 return 0
55
56 def read(self, path, size, offset, fh):
57 f = self.cache[fh]
58 f.seek(offset)
59 out = f.read(size)
60 return out
61
62 def write(self, path, data, offset, fh):
63 f = self.cache[fh]
64 f.write(data)
65 return len(data)
66
67 def create(self, path, flags, fi=None):
68 fn = "".join([self.root, path.lstrip("/")])
69 f = self.fs.open(fn, "wb")
70 self.cache[self.counter] = f
71 self.counter += 1
72 return self.counter - 1
73
74 def open(self, path, flags):
75 fn = "".join([self.root, path.lstrip("/")])
76 if flags % 2 == 0:
77 # read
78 mode = "rb"
79 else:
80 # write/create
81 mode = "wb"
82 self.cache[self.counter] = self.fs.open(fn, mode)
83 self.counter += 1
84 return self.counter - 1
85
86 def truncate(self, path, length, fh=None):
87 fn = "".join([self.root, path.lstrip("/")])
88 if length != 0:
89 raise NotImplementedError
90 # maybe should be no-op since open with write sets size to zero anyway
91 self.fs.touch(fn)
92
93 def unlink(self, path):
94 fn = "".join([self.root, path.lstrip("/")])
95 try:
96 self.fs.rm(fn, False)
97 except (IOError, FileNotFoundError):
98 raise FuseOSError(EIO)
99
100 def release(self, path, fh):
101 try:
102 if fh in self.cache:
103 f = self.cache[fh]
104 f.close()
105 self.cache.pop(fh)
106 except Exception as e:
107 print(e)
108 return 0
109
110 def chmod(self, path, mode):
111 raise NotImplementedError
112
113
114 def run(fs, path, mount_point, foreground=True, threads=False):
115 """ Mount stuff in a local directory
116
117 This uses fusepy to make it appear as if a given path on an fsspec
118 instance is in fact resident within the local file-system.
119
120 This requires that fusepy be installed, and that FUSE be available on
121 the system (typically requiring a package to be installed with
122 apt, yum, brew, etc.).
123
124 Parameters
125 ----------
126 fs: file-system instance
127 From one of the compatible implementations
128 path: str
129 Location on that file-system to regard as the root directory to
130 mount. Note that you typically should include the terminating "/"
131 character.
132 mount_point: str
133 An empty directory on the local file-system where the contents of
134 the remote path will appear
135 foreground: bool
136 Whether or not calling this function will block. Operation will
137 typically be more stable if True.
138 threads: bool
139 Whether or not to create threads when responding to file operations
140 within the mounted directory. Operation will typically be more
141 stable if False.
142
143 """
144 func = lambda: FUSE(
145 FUSEr(fs, path), mount_point, nothreads=not threads, foreground=True
146 )
147 if foreground is False:
148 th = threading.Thread(target=func)
149 th.daemon = True
150 th.start()
151 return th
152 else: # pragma: no cover
153 try:
154 func()
155 except KeyboardInterrupt:
156 pass
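# Usage sketch (assumes fusepy and a working FUSE installation, plus an
# existing empty directory "/mnt/mem"): expose an in-memory filesystem
# locally and block until interrupted.
#
#     import fsspec
#     fs = fsspec.filesystem("memory")
#     run(fs, "/", "/mnt/mem", foreground=True)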
0 import time
1 import pickle
2 import logging
3 import os
4 import hashlib
5 import tempfile
6 import inspect
7 from fsspec import AbstractFileSystem, filesystem
8 from fsspec.spec import AbstractBufferedFile
9 from fsspec.core import MMapCache, BaseCache
10
11 logger = logging.getLogger("fsspec")
12
13
14 class CachingFileSystem(AbstractFileSystem):
15 """Locally caching filesystem, layer over any other FS
16
17 This class implements chunk-wise local storage of remote files, for quick
18 access after the initial download. The files are stored in a given
19 directory with random hashes for the filenames. If no directory is given,
20 a temporary one is used, which should be cleaned up by the OS after the
21 process ends. The files themselves are sparse (as implemented in
22 MMapCache), so only the data which is accessed takes up space.
23
24 Restrictions:
25
26 - the block-size must be the same for each access of a given file, unless
27 all blocks of the file have already been read
28 - caching can only be applied to file-systems which produce files
29 derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
30 allowed, for testing
31 """
32
33 protocol = ("blockcache", "cached")
34
35 def __init__(
36 self,
37 target_protocol=None,
38 cache_storage="TMP",
39 cache_check=10,
40 check_files=False,
41 expiry_time=604800,
42 target_options=None,
43 **kwargs
44 ):
45 """
46
47 Parameters
48 ----------
49 target_protocol: str
50 Target filesystem protocol
51 cache_storage: str or list(str)
52 Location to store files. If "TMP", this is a temporary directory,
53 and will be cleaned up by the OS when this process ends (or later).
54 If a list, each location will be tried in the order given, but
55 only the last will be considered writable.
56 cache_check: int
57 Number of seconds between reload of cache metadata
58 check_files: bool
59 Whether to explicitly see if the UID of the remote file matches
60 the stored one before using. Warning: some file systems such as
61 HTTP cannot reliably give a unique hash of the contents of some
62 path, so be sure to set this option to False.
63 expiry_time: int
64 The time in seconds after which a local copy is considered useless.
65 Set to falsy to prevent expiry. The default is equivalent to one
66 week.
67 target_options: dict or None
68 Passed to the instantiation of the FS, if fs is None.
69 """
70 if self._cached:
71 return
72 super().__init__(**kwargs)
73 if cache_storage == "TMP":
74 storage = [tempfile.mkdtemp()]
75 else:
76 if isinstance(cache_storage, str):
77 storage = [cache_storage]
78 else:
79 storage = cache_storage
80 os.makedirs(storage[-1], exist_ok=True)
81 self.storage = storage
82 self.kwargs = target_options or {}
83 self.cache_check = cache_check
84 self.check_files = check_files
85 self.expiry = expiry_time
86 self.load_cache()
87 if isinstance(target_protocol, AbstractFileSystem):
88 self.fs = target_protocol
89 self.protocol = self.fs.protocol
90 else:
91 self.protocol = target_protocol
92 self.fs = filesystem(target_protocol, **self.kwargs)
93
94 def __reduce_ex__(self, *_):
95 return (
96 self.__class__,
97 (
98 self.protocol,
99 self.storage,
100 self.cache_check,
101 self.check_files,
102 self.expiry,
103 self.kwargs or None,
104 ),
105 )
106
107 def load_cache(self):
108 """Read set of stored blocks from file"""
109 cached_files = []
110 for storage in self.storage:
111 fn = os.path.join(storage, "cache")
112 if os.path.exists(fn):
113 with open(fn, "rb") as f:
114 # TODO: consolidate blocks here
115 cached_files.append(pickle.load(f))
116 else:
117 os.makedirs(storage, exist_ok=True)
118 cached_files.append({})
119 self.cached_files = cached_files or [{}]
120 self.last_cache = time.time()
121
122 def save_cache(self):
123 """Save set of stored blocks from file"""
124 fn = os.path.join(self.storage[-1], "cache")
125 # TODO: a file lock could be used to ensure file does not change
126 # between re-read and write; but occasional duplicated reads ok.
127 cache = self.cached_files[-1]
128 if os.path.exists(fn):
129 with open(fn, "rb") as f:
130 cached_files = pickle.load(f)
131 for k, c in cached_files.items():
132 if c["blocks"] is not True:
133 if cache[k]["blocks"] is True:
134 c["blocks"] = True
135 else:
136 c["blocks"] = set(c["blocks"]).union(cache[k]["blocks"])
137 else:
138 cached_files = cache
139 cache = {k: v.copy() for k, v in cached_files.items()}
140 for c in cache.values():
141 if isinstance(c["blocks"], set):
142 c["blocks"] = list(c["blocks"])
143 with open(fn + ".temp", "wb") as f:
144 pickle.dump(cache, f)
145 if os.path.exists(fn):
146 os.remove(fn)
147 os.rename(fn + ".temp", fn)
148
149 def _check_cache(self):
150 """Reload caches if time elapsed or any disappeared"""
151 if not self.cache_check:
152 # explicitly told not to bother checking
153 return
154 timecond = time.time() - self.last_cache > self.cache_check
155 existcond = all(os.path.exists(storage) for storage in self.storage)
156 if timecond or not existcond:
157 self.load_cache()
158
159 def _check_file(self, path):
160 """Is path in cache and still valid"""
161 self._check_cache()
162 for storage, cache in zip(self.storage, self.cached_files):
163 if path not in cache:
164 continue
165 detail = cache[path].copy()
166 if self.check_files:
167 if detail["uid"] != self.fs.ukey(path):
168 continue
169 if self.expiry:
170 if detail["time"] - time.time() > self.expiry:
171 continue
172 fn = os.path.join(storage, detail["fn"])
173 if os.path.exists(fn):
174 return detail, fn
175 return False, None
176
177 def _open(self, path, mode="rb", **kwargs):
178 """Wrap the target _open
179
180 If the whole file exists in the cache, just open it locally and
181 return that.
182
183 Otherwise, open the file on the target FS, and make it have a mmap
184 cache pointing to the location which we determine, in our cache.
185 The ``blocks`` instance is shared, so as the mmap cache instance
186 updates, so does the entry in our ``cached_files`` attribute.
187 We monkey-patch this file, so that when it closes, we call
188 ``close_and_update`` to save the state of the blocks.
189 """
190 path = self._strip_protocol(path)
191 if not path.startswith(self.protocol):
192 path = self.protocol + "://" + path
193 if mode != "rb":
194 return self.fs._open(path, mode=mode, **kwargs)
195 detail, fn = self._check_file(path)
196 if detail:
197 # file is in cache
198 hash, blocks = detail["fn"], detail["blocks"]
199 if blocks is True:
200 # stored file is complete
201 logger.debug("Opening local copy of %s" % path)
202 return open(fn, "rb")
203 # TODO: action where partial file exists in read-only cache
204 logger.debug("Opening partially cached copy of %s" % path)
205 else:
206 hash = hashlib.sha256(path.encode()).hexdigest()
207 fn = os.path.join(self.storage[-1], hash)
208 blocks = set()
209 detail = {
210 "fn": hash,
211 "blocks": blocks,
212 "time": time.time(),
213 "uid": self.fs.ukey(path),
214 }
215 self.cached_files[-1][path] = detail
216 logger.debug("Creating local sparse file for %s" % path)
217 kwargs["cache_type"] = "none"
218 kwargs["mode"] = mode
219
220 # call target filesystems open
221 f = self.fs._open(path, **kwargs)
222 if "blocksize" in detail:
223 if detail["blocksize"] != f.blocksize:
224 raise ValueError(
225 "Cached file must be reopened with same block"
226 "size as original (old: %i, new %i)"
227 "" % (detail["blocksize"], f.blocksize)
228 )
229 else:
230 detail["blocksize"] = f.blocksize
231 f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks)
232 close = f.close
233 f.close = lambda: self.close_and_update(f, close)
234 return f
235
236 def close_and_update(self, f, close):
237 """Called when a file is closing, so store the set of blocks"""
238 if f.path.startswith(self.protocol):
239 path = f.path
240 else:
241 path = self.protocol + "://" + f.path
242 c = self.cached_files[-1][path]
243 if c["blocks"] is not True and len(["blocks"]) * f.blocksize >= f.size:
244 c["blocks"] = True
245 self.save_cache()
246 close()
247
248 def __getattribute__(self, item):
249 if item in [
250 "load_cache",
251 "_open",
252 "save_cache",
253 "close_and_update",
254 "__init__",
255 "__getattribute__",
256 "__reduce_ex__",
257 "open",
258 "cat",
259 "get",
260 "read_block",
261 "tail",
262 "head",
263 "_check_file",
264 "_check_cache",
265 ]:
266 # all the methods defined in this class. Note `open` here, since
267 # it calls `_open`, but is actually in superclass
268 return lambda *args, **kw: getattr(type(self), item)(self, *args, **kw)
269 if item == "__class__":
270 return type(self)
271 d = object.__getattribute__(self, "__dict__")
272 fs = d.get("fs", None) # fs is not immediately defined
273 if item in d:
274 return d[item]
275 elif fs is not None:
276 if item in fs.__dict__:
277 # attribute of instance
278 return fs.__dict__[item]
279 # attribute belonging to the target filesystem
280 cls = type(fs)
281 m = getattr(cls, item)
282 if inspect.isfunction(m) and (
283 not hasattr(m, "__self__") or m.__self__ is None
284 ):
285 # instance method
286 return m.__get__(fs, cls)
287 return m # class method or attribute
288 else:
289 # attributes of the superclass, while target is being set up
290 return super().__getattribute__(item)
291
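# Usage sketch (assumes the HTTP implementation's dependencies are installed,
# the URL is reachable, and "/tmp/fsspec-cache" is writable): blocks are
# fetched on demand and reused on later opens of the same remote file.
#
#     import fsspec
#     fs = fsspec.filesystem(
#         "blockcache", target_protocol="http", cache_storage="/tmp/fsspec-cache"
#     )
#     with fs.open("http://example.com/some/file.bin", "rb") as f:
#         head = f.read(1024)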
292
293 class WholeFileCacheFileSystem(CachingFileSystem):
294 """Caches whole remote files on first access
295
296 This class is intended as a layer over any other file system, and
297 will make a local copy of each file accessed, so that all subsequent
298 reads are local. This is similar to ``CachingFileSystem``, but without
299 the block-wise functionality and so can work even when sparse files
300 are not allowed. See its docstring for definition of the init
301 arguments.
302
303 The class still needs access to the remote store for listing files,
304 and may refresh cached files.
305 """
306
307 protocol = "filecache"
308
309 def _open(self, path, mode="rb", **kwargs):
310 path = self._strip_protocol(path)
311 if not path.startswith(self.protocol):
312 path = self.protocol + "://" + path
313 if mode != "rb":
314 return self.fs._open(path, mode=mode, **kwargs)
315 detail, fn = self._check_file(path)
316 if detail:
317 hash, blocks = detail["fn"], detail["blocks"]
318 if blocks is True:
319 logger.debug("Opening local copy of %s" % path)
320 return open(fn, "rb")
321 else:
322 raise ValueError(
323 "Attempt to open partially cached file %s"
324 "as a wholly cached file" % path
325 )
326 else:
327 hash = hashlib.sha256(path.encode()).hexdigest()
328 fn = os.path.join(self.storage[-1], hash)
329 blocks = True
330 detail = {
331 "fn": hash,
332 "blocks": blocks,
333 "time": time.time(),
334 "uid": self.fs.ukey(path),
335 }
336 self.cached_files[-1][path] = detail
337 logger.debug("Copying %s to local cache" % path)
338 kwargs["mode"] = mode
339
340 # call target filesystems open
341 # TODO: why not just use fs.get ??
342 f = self.fs._open(path, **kwargs)
343 with open(fn, "wb") as f2:
344 if isinstance(f, AbstractBufferedFile):
345 # want no type of caching if just downloading whole thing
346 f.cache = BaseCache(0, f.cache.fetcher, f.size)
347 if getattr(f, "blocksize", 0) and f.size:
348 # opportunity to parallelise here
349 data = True
350 while data:
351 data = f.read(f.blocksize)
352 f2.write(data)
353 else:
354 # this only applies to HTTP, should instead use streaming
355 f2.write(f.read())
356 self.save_cache()
357 return self._open(path, mode)
0 from distributed.worker import get_worker
1 from distributed.client import _get_global_client
2 import dask
3 from fsspec.spec import AbstractFileSystem, AbstractBufferedFile
4 from fsspec import filesystem
5
6
7 def make_instance(cls, args, kwargs):
8 inst = cls(*args, **kwargs)
9 inst._determine_worker()
10 return inst
11
12
13 class DaskWorkerFileSystem(AbstractFileSystem):
14 """View files accessible to a worker as any other remote file-system
15
16 When instances are run on the worker, uses the real filesystem. When
17 run on the client, they call the worker to provide information or data.
18
19 **Warning** this implementation is experimental, and read-only for now.
20 """
21
22 def __init__(self, remote_protocol, remote_options=None, **kwargs):
23 super().__init__(**kwargs)
24 self.protocol = remote_protocol
25 self.remote_options = remote_options
26 self.worker = None
27 self.client = None
28 self.fs = None
29 self._determine_worker()
30
31 def _determine_worker(self):
32 try:
33 get_worker()
34 self.worker = True
35 self.fs = filesystem(self.protocol, **(self.remote_options or {}))
36 except ValueError:
37 self.worker = False
38 self.client = _get_global_client()
39 self.rfs = dask.delayed(self)
40
41 def __reduce__(self):
42 return make_instance, (type(self), self.storage_args, self.storage_options)
43
44 def mkdir(self, *args, **kwargs):
45 if self.worker:
46 self.fs.mkdir(*args, **kwargs)
47 else:
48 self.rfs.mkdir(*args, **kwargs).compute()
49
50 def rm(self, *args, **kwargs):
51 if self.worker:
52 self.fs.rm(*args, **kwargs)
53 else:
54 self.rfs.rm(*args, **kwargs).compute()
55
56 def copy(self, *args, **kwargs):
57 if self.worker:
58 self.fs.copy(*args, **kwargs)
59 else:
60 self.rfs.copy(*args, **kwargs).compute()
61
62 def mv(self, *args, **kwargs):
63 if self.worker:
64 self.fs.mv(*args, **kwargs)
65 else:
66 self.rfs.mv(*args, **kwargs).compute()
67
68 def ls(self, *args, **kwargs):
69 if self.worker:
70 return self.fs.ls(*args, **kwargs)
71 else:
72 return self.rfs.ls(*args, **kwargs).compute()
73
74 def _open(self, path, mode="rb", **kwargs):
75 if self.worker:
76 return self.fs._open(path, mode=mode)
77 else:
78 return DaskFile(self, path, mode, **kwargs)
79
80 def fetch_range(self, path, mode, start, end):
81 if self.worker:
82 with self._open(path, mode) as f:
83 f.seek(start)
84 return f.read(end - start)
85 else:
86 return self.rfs.fetch_range(path, mode, start, end).compute()
87
88
89 class DaskFile(AbstractBufferedFile):
90 def __init__(
91 self,
92 fs,
93 path,
94 mode="rb",
95 block_size="default",
96 autocommit=True,
97 cache_type="bytes",
98 **kwargs
99 ):
100 super().__init__(
101 fs,
102 path,
103 mode=mode,
104 block_size=block_size,
105 autocommit=autocommit,
106 cache_type=cache_type,
107 **kwargs
108 )
109
110 def _upload_chunk(self, final=False):
111 pass
112
113 def _initiate_upload(self):
114 """ Create remote file/upload """
115 pass
116
117 def _fetch_range(self, start, end):
118 """Get the specified set of bytes from remote"""
119 return self.fs.fetch_range(self.path, self.mode, start, end)
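# Usage sketch (assumes a running dask.distributed client/cluster and files
# local to the workers under /data): calls made on the client are forwarded
# to a worker, which uses the real filesystem.
#
#     fs = DaskWorkerFileSystem(remote_protocol="file")
#     fs.ls("/data")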
0 from ftplib import FTP, Error, error_perm
1 from socket import timeout
2 import uuid
3 from ..spec import AbstractBufferedFile, AbstractFileSystem
4 from ..utils import infer_storage_options
5
6
7 class FTPFileSystem(AbstractFileSystem):
8 """A filesystem over classic """
9
10 root_marker = "/"
11 cachable = False
12
13 def __init__(
14 self,
15 host,
16 port=21,
17 username=None,
18 password=None,
19 acct=None,
20 block_size=None,
21 tempdir="/tmp",
22 timeout=30,
23 **kwargs
24 ):
25 """
26 You can use _get_kwargs_from_urls to get some kwargs from
27 a reasonable FTP url.
28
29 Authentication will be anonymous if username/password are not
30 given.
31
32 Parameters
33 ----------
34 host: str
35 The remote server name/ip to connect to
36 port: int
37 Port to connect with
38 username: str or None
39 If authenticating, the user's identifier
40 password: str or None
41 User's password on the server, if using
42 acct: str or None
43 Some servers also need an "account" string for auth
44 block_size: int or None
45 If given, the read-ahead or write buffer size.
46 tempdir: str
47 Directory on remote to put temporary files when in a transaction
48 """
49 super(FTPFileSystem, self).__init__(**kwargs)
50 self.host = host
51 self.port = port
52 self.tempdir = tempdir
53 self.cred = username, password, acct
54 self.timeout = timeout
55 if block_size is not None:
56 self.blocksize = block_size
57 else:
58 self.blocksize = 2 ** 16
59 self._connect()
60
61 def _connect(self):
62 self.ftp = FTP(timeout=self.timeout)
63 self.ftp.connect(self.host, self.port)
64 self.ftp.login(*self.cred)
65
66 @classmethod
67 def _strip_protocol(cls, path):
68 return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
69
70 @staticmethod
71 def _get_kwargs_from_urls(urlpath):
72 out = infer_storage_options(urlpath)
73 out.pop("path", None)
74 out.pop("protocol", None)
75 return out
76
77 def invalidate_cache(self, path=None):
78 if path is not None:
79 self.dircache.pop(path, None)
80 else:
81 self.dircache.clear()
82
83 def ls(self, path, detail=True):
84 path = self._strip_protocol(path)
85 out = []
86 if path not in self.dircache:
87 try:
88 try:
89 out = [
90 (fn, details)
91 for (fn, details) in self.ftp.mlsd(path)
92 if fn not in [".", ".."]
93 and details["type"] not in ["pdir", "cdir"]
94 ]
95 except error_perm:
96 out = _mlsd2(self.ftp, path) # Not platform independent
97 for fn, details in out:
98 if path == "/":
99 path = "" # just for forming the names, below
100 details["name"] = "/".join([path, fn.lstrip("/")])
101 if details["type"] == "file":
102 details["size"] = int(details["size"])
103 else:
104 details["size"] = 0
105 self.dircache[path] = out
106 except Error:
107 try:
108 info = self.info(path)
109 if info["type"] == "file":
110 out = [(path, info)]
111 except (Error, IndexError):
112 raise FileNotFoundError
113 files = self.dircache.get(path, out)
114 if not detail:
115 return sorted([fn for fn, details in files])
116 return [details for fn, details in files]
117
118 def info(self, path, **kwargs):
119 # implement with direct method
120 path = self._strip_protocol(path)
121 files = self.ls(self._parent(path).lstrip("/"), True)
122 try:
123 out = [f for f in files if f["name"] == path][0]
124 except IndexError:
125 raise FileNotFoundError(path)
126 return out
127
128 def _open(self, path, mode="rb", block_size=None, autocommit=True, **kwargs):
129 path = self._strip_protocol(path)
130 block_size = block_size or self.blocksize
131 return FTPFile(
132 self,
133 path,
134 mode=mode,
135 block_size=block_size,
136 tempdir=self.tempdir,
137 autocommit=autocommit,
138 )
139
140 def _rm(self, path):
141 path = self._strip_protocol(path)
142 self.ftp.delete(path)
143 self.invalidate_cache(path.rsplit("/", 1)[0])
144
145 def mkdir(self, path, **kwargs):
146 path = self._strip_protocol(path)
147 self.ftp.mkd(path)
148
149 def rmdir(self, path):
150 path = self._strip_protocol(path)
151 self.ftp.rmd(path)
152
153 def mv(self, path1, path2, **kwargs):
154 path1 = self._strip_protocol(path1)
155 path2 = self._strip_protocol(path2)
156 self.ftp.rename(path1, path2)
157 self.invalidate_cache(self._parent(path1))
158 self.invalidate_cache(self._parent(path2))
159
160 def __del__(self):
161 self.ftp.close()
162
163
164 class TransferDone(Exception):
165 """Internal exception to break out of transfer"""
166
167 pass
168
169
170 class FTPFile(AbstractBufferedFile):
171 """Interact with a remote FTP file with read/write buffering"""
172
173 def __init__(self, fs, path, **kwargs):
174 super().__init__(fs, path, **kwargs)
175 if kwargs.get("autocommit", False) is False:
176 self.target = self.path
177 self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
178
179 def commit(self):
180 self.fs.mv(self.path, self.target)
181
182 def discard(self):
183 self.fs.rm(self.path)
184
185 def _fetch_range(self, start, end):
186 """Get bytes between given byte limits
187
188 Implemented by raising an exception in the fetch callback when the
189 number of bytes received reaches the requested amount.
190
191 Will fail if the server does not respect the REST command on
192 retrieve requests.
193 """
194 out = []
195 total = [0]
196
197 def callback(x):
198 total[0] += len(x)
199 if total[0] > end - start:
200 out.append(x[: (end - start) - total[0]])
201 raise TransferDone
202 else:
203 out.append(x)
204
205 if total[0] == end - start:
206 raise TransferDone
207
208 try:
209 self.fs.ftp.retrbinary(
210 "RETR %s" % self.path,
211 blocksize=self.blocksize,
212 rest=start,
213 callback=callback,
214 )
215 except TransferDone:
216 try:
217 self.fs.ftp.abort()
218 self.fs.ftp.voidresp()
219 except timeout:
220 self.fs._connect()
221 return b"".join(out)
222
223 def _upload_chunk(self, final=False):
224 self.buffer.seek(0)
225 self.fs.ftp.storbinary(
226 "STOR " + self.path, self.buffer, blocksize=self.blocksize, rest=self.offset
227 )
228 return True
229
230
231 def _mlsd2(ftp, path="."):
232 """
233 Fall back to using `dir` instead of `mlsd` if not supported.
234
235 This parses a Linux style `ls -l` response to `dir`, but the response may
236 be platform dependent.
237
238 Parameters
239 ----------
240 ftp: ftplib.FTP
241 path: str
242 Path to list; defaults to the current directory, ".".
243 """
244 lines = []
245 minfo = []
246 ftp.dir(path, lines.append)
247 for line in lines:
248 line = line.split()
249 this = (
250 line[-1],
251 {
252 "modify": " ".join(line[5:8]),
253 "unix.owner": line[2],
254 "unix.group": line[3],
255 "unix.mode": line[0],
256 "size": line[4],
257 },
258 )
259 if "d" == this[1]["unix.mode"][0]:
260 this[1]["type"] = "dir"
261 else:
262 this[1]["type"] = "file"
263 minfo.append(this)
264 return minfo
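
A minimal usage sketch of the FTP implementation above; the server address and credentials are placeholders, and listing falls back to `_mlsd2` when the server does not support MLSD:

from fsspec.implementations.ftp import FTPFileSystem

# placeholder connection details -- substitute a real server and credentials
fs = FTPFileSystem("ftp.example.com", 21, "anonymous", "guest@example.com")

print(fs.ls("/", detail=False))               # MLSD listing, or the `dir` fallback
with fs.open("/pub/somefile.txt", "rb", block_size=2 ** 16) as f:
    header = f.read(100)                      # fetched via REST + RETR, truncated by the callback
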
0 import io
1 import requests
2 from ..spec import AbstractFileSystem
3
4
5 class GithubFileSystem(AbstractFileSystem):
6 """[Experimental] interface to files in github
7
8 An instance of this class provides access to the files residing within a
9 remote github repository. You may specify a point in the repo's history,
10 by SHA, branch or tag (the default is the current master).
11
12 Given that code files tend to be small, and that github does not support
13 retrieving partial content, we always fetch whole files.
14 """
15
16 url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
17 rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
18 protocol = "github"
19
20 def __init__(self, org, repo, sha="master", **kwargs):
21 super().__init__(**kwargs)
22 self.org = org
23 self.repo = repo
24 self.root = sha
25 self.ls("")
26
27 def ls(self, path, detail=False, sha=None, **kwargs):
28 if path == "":
29 sha = self.root
30 if sha is None:
31 parts = path.rstrip("/").split("/")
32 so_far = ""
33 sha = self.root
34 for part in parts:
35 out = self.ls(so_far, True, sha=sha)
36 so_far += "/" + part if so_far else part
37 out = [o for o in out if o["name"] == so_far][0]
38 if out["type"] == "file":
39 if detail:
40 return [out]
41 else:
42 return path
43 sha = out["sha"]
44 if path not in self.dircache:
45 r = requests.get(self.url.format(org=self.org, repo=self.repo, sha=sha))
46 self.dircache[path] = [
47 {
48 "name": path + "/" + f["path"] if path else f["path"],
49 "mode": f["mode"],
50 "type": {"blob": "file", "tree": "directory"}[f["type"]],
51 "size": f.get("size", 0),
52 "sha": f["sha"],
53 }
54 for f in r.json()["tree"]
55 ]
56 if detail:
57 return self.dircache[path]
58 else:
59 return sorted([f["name"] for f in self.dircache[path]])
60
61 def _open(self, path, mode="rb", **kwargs):
62 if mode != "rb":
63 raise NotImplementedError
64 url = self.rurl.format(org=self.org, repo=self.repo, path=path, sha=self.root)
65 r = requests.get(url)
66 return io.BytesIO(r.content)
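
A short, hedged sketch of how the experimental GitHub filesystem above might be used; the repository coordinates and file path are illustrative, and a network connection is required:

from fsspec.implementations.github import GithubFileSystem

# illustrative repository; any public org/repo/sha should work
fs = GithubFileSystem(org="intake", repo="filesystem_spec", sha="master")

print(fs.ls("", detail=False))            # top-level tree of the chosen commit
with fs.open("README.rst", "rb") as f:    # whole file fetched from raw.githubusercontent.com
    text = f.read()
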
0 from ..spec import AbstractFileSystem
1 from ..utils import infer_storage_options
2 from pyarrow.hdfs import HadoopFileSystem
3
4
5 class PyArrowHDFS(AbstractFileSystem):
6 """Adapted version of Arrow's HadoopFileSystem
7
8 This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which
9 passes on all calls to the underlying class.
10 """
11
12 def __init__(
13 self,
14 host="default",
15 port=0,
16 user=None,
17 kerb_ticket=None,
18 driver="libhdfs",
19 extra_conf=None,
20 **kwargs
21 ):
22 """
23
24 Parameters
25 ----------
26 host: str
27 Hostname, IP or "default" to try to read from Hadoop config
28 port: int
29 Port to connect on, or default from Hadoop config if 0
30 user: str or None
31 If given, connect as this username
32 kerb_ticket: str or None
33 If given, use this ticket for authentication
34 driver: 'libhdfs' or 'libhdfs3'
35 Binary driver; libhdfs is the JNI library and the default
36 extra_conf: None or dict
37 Passed on to HadoopFileSystem
38 """
39 if self._cached:
40 return
41 AbstractFileSystem.__init__(self, **kwargs)
42 self.pars = (host, port, user, kerb_ticket, driver, extra_conf)
43 self.pahdfs = HadoopFileSystem(
44 host=host,
45 port=port,
46 user=user,
47 kerb_ticket=kerb_ticket,
48 driver=driver,
49 extra_conf=extra_conf,
50 )
51
52 def _open(self, path, mode="rb", block_size=None, autocommit=True, **kwargs):
53 """
54
55 Parameters
56 ----------
57 path: str
58 Location of file; should start with '/'
59 mode: str
60 block_size: int
61 Hadoop block size, e.g., 2**26
62 autocommit: bool
63 Transactions are not yet implemented for HDFS; errors if not True
64 kwargs: dict or None
65 Hadoop config parameters
66
67 Returns
68 -------
69 HDFSFile file-like instance
70 """
71 if not autocommit:
72 raise NotImplementedError
73 return HDFSFile(self, path, mode, block_size, **kwargs)
74
75 def __reduce_ex__(self, protocol):
76 return PyArrowHDFS, self.pars
77
78 def ls(self, path, detail=True):
79 out = self.pahdfs.ls(path, detail)
80 if detail:
81 for p in out:
82 p["type"] = p["kind"]
83 p["name"] = self._strip_protocol(p["name"])
84 else:
85 out = [self._strip_protocol(p) for p in out]
86 return out
87
88 @staticmethod
89 def _get_kwargs_from_urls(paths):
90 ops = infer_storage_options(paths)
91 out = {}
92 if ops.get("host", None):
93 out["host"] = ops["host"]
94 if ops.get("username", None):
95 out["user"] = ops["username"]
96 if ops.get("port", None):
97 out["port"] = ops["port"]
98 return out
99
100 @classmethod
101 def _strip_protocol(cls, path):
102 ops = infer_storage_options(path)
103 return ops["path"]
104
105 def __getattribute__(self, item):
106 if item in [
107 "_open",
108 "__init__",
109 "__getattribute__",
110 "__reduce_ex__",
111 "open",
112 "ls",
113 "makedirs",
114 ]:
115 # all the methods defined in this class. Note `open` here: it is
116 # defined in the superclass, but ends up calling our `_open`
117 return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw)
118 if item == "__class__":
119 return PyArrowHDFS
120 d = object.__getattribute__(self, "__dict__")
121 pahdfs = d.get("pahdfs", None) # not set until __init__ has completed
122 if pahdfs is not None and item in [
123 "chmod",
124 "chown",
125 "user",
126 "df",
127 "disk_usage",
128 "download",
129 "driver",
130 "exists",
131 "extra_conf",
132 "get_capacity",
133 "get_space_used",
134 "host",
135 "is_open",
136 "kerb_ticket",
137 "strip_protocol",
138 "mkdir",
139 "mv",
140 "port",
141 "get_capacity",
142 "get_space_used",
143 "df",
144 "chmod",
145 "chown",
146 "disk_usage",
147 "download",
148 "upload",
149 "_get_kwargs_from_urls",
150 "read_parquet",
151 "rm",
152 "stat",
153 "upload",
154 ]:
155 return getattr(pahdfs, item)
156 else:
157 # attributes of the superclass, while target is being set up
158 return super().__getattribute__(item)
159
160
161 class HDFSFile(object):
162 """Wrapper around arrow's HdfsFile
163
164 Allows seek beyond EOF and (eventually) commit/discard
165 """
166
167 def __init__(self, fs, path, mode, block_size, **kwargs):
168 self.fs = fs
169 self.path = path
170 self.mode = mode
171 self.block_size = block_size
172 self.fh = fs.pahdfs.open(path, mode, block_size, **kwargs)
173 if self.fh.readable():
174 self.seek_size = self.size()
175
176 def seek(self, loc, whence=0):
177 if whence == 0 and self.readable():
178 loc = min(loc, self.seek_size)
179 return self.fh.seek(loc, whence)
180
181 def __getattr__(self, item):
182 return getattr(self.fh, item)
183
184 def __reduce_ex__(self, protocol):
185 return HDFSFile, (self.fs, self.path, self.mode, self.block_size)
186
187 def __enter__(self):
188 return self
189
190 def __exit__(self, exc_type, exc_val, exc_tb):
191 self.close()
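
A usage sketch for the HDFS wrapper above, assuming a reachable cluster and a working libhdfs installation; host, port and paths are placeholders:

from fsspec.implementations.hdfs import PyArrowHDFS

# placeholder cluster coordinates
fs = PyArrowHDFS(host="namenode.example.com", port=8020, user="hadoop")

print(fs.ls("/user/hadoop"))                      # delegated to pyarrow, names stripped of protocol
with fs.open("/user/hadoop/data.bin", "rb", block_size=2 ** 26) as f:
    chunk = f.read(1024)                          # HDFSFile wraps pyarrow's file object
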
0 from __future__ import print_function, division, absolute_import
1
2 import re
3 import requests
4 from urllib.parse import urlparse
5 from fsspec import AbstractFileSystem
6 from fsspec.spec import AbstractBufferedFile
7 from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE
8
9 # https://stackoverflow.com/a/15926317/3821154
10 ex = re.compile(r"""<a\s+(?:[^>]*?\s+)?href=(["'])(.*?)\1""")
11 ex2 = re.compile(r"""(http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
12
13
14 class HTTPFileSystem(AbstractFileSystem):
15 """
16 Simple File-System for fetching data via HTTP(S)
17
18 ``ls()`` is implemented by loading the parent page and doing a regex
19 match on the result. If simple_links=True, anything that looks like a
20 URL, e.g., "http(s)://server.com/stuff?thing=other", is treated as a
21 link; otherwise only links within HTML href tags are used.
22 """
23
24 sep = "/"
25
26 def __init__(
27 self,
28 simple_links=True,
29 block_size=None,
30 same_scheme=True,
31 size_policy=None,
32 **storage_options
33 ):
34 """
35 Parameters
36 ----------
37 block_size: int
38 Block size, in bytes, for reading; if 0, raw streaming requests
39 file-like objects are returned instead of HTTPFile instances
40 simple_links: bool
41 If True, will consider both HTML <a> tags and anything that looks
42 like a URL; if False, will consider only the former.
43 same_scheme: bool
44 When doing ls/glob, if this is True, only consider paths whose
45 http/https scheme matches that of the input URL.
46 size_policy: this argument is deprecated
47 storage_options: key-value
48 May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
49 other parameters passed on to requests
50 """
51 AbstractFileSystem.__init__(self)
52 self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
53 self.simple_links = simple_links
54 self.same_schema = same_scheme
55 self.kwargs = storage_options
56 self.session = requests.Session()
57
58 @classmethod
59 def _strip_protocol(cls, path):
60 """ For HTTP, we always want to keep the full URL
61 """
62 return path
63
64 # TODO: override get
65
66 def ls(self, url, detail=True):
67 # ignoring URL-encoded arguments
68 r = self.session.get(url, **self.kwargs)
69 if self.simple_links:
70 links = ex2.findall(r.text) + ex.findall(r.text)
71 else:
72 links = ex.findall(r.text)
73 out = set()
74 parts = urlparse(url)
75 for l in links:
76 if isinstance(l, tuple):
77 l = l[1]
78 if l.startswith("http"):
79 if self.same_schema:
80 if l.split(":", 1)[0] == url.split(":", 1)[0]:
81 out.add(l)
82 elif l.replace("https", "http").startswith(
83 url.replace("https", "http")
84 ):
85 # allowed to cross http <-> https
86 out.add(l)
87 elif l.startswith("/") and len(l) > 1:
88 out.add(parts.scheme + "://" + parts.netloc + l)
89 else:
90 if l not in ["..", "../"]:
91 # Ignore FTP-like "parent"
92 out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
93 if not out and url.endswith("/"):
94 return self.ls(url.rstrip("/"), detail=detail)
95 if detail:
96 return [
97 {
98 "name": u,
99 "size": None,
100 "type": "directory" if u.endswith("/") else "file",
101 }
102 for u in out
103 ]
104 else:
105 return list(sorted(out))
106
107 def cat(self, url):
108 r = requests.get(url, **self.kwargs)
109 r.raise_for_status()
110 return r.content
111
112 def mkdirs(self, url):
113 """Make any intermediate directories to make path writable"""
114 raise NotImplementedError
115
116 def exists(self, path):
117 kwargs = self.kwargs.copy()
118 kwargs["stream"] = True
119 try:
120 r = self.session.get(path, **kwargs)
121 r.close()
122 return r.ok
123 except requests.HTTPError:
124 return False
125
126 def _open(self, url, mode="rb", block_size=None, cache_options=None, **kwargs):
127 """Make a file-like object
128
129 Parameters
130 ----------
131 url: str
132 Full URL with protocol
133 mode: string
134 must be "rb"
135 block_size: int or None
136 Bytes to download in one request; use instance value if None. If
137 zero, will return a streaming Requests file-like instance.
138 kwargs: key-value
139 Any other parameters, passed to requests calls
140 """
141 if mode != "rb":
142 raise NotImplementedError
143 block_size = block_size if block_size is not None else self.block_size
144 kw = self.kwargs.copy()
145 kw.update(kwargs)
146 kw.pop("autocommit", None)
147 if block_size:
148 return HTTPFile(
149 self, url, self.session, block_size, cache_options=cache_options, **kw
150 )
151 else:
152 kw["stream"] = True
153 r = self.session.get(url, **kw)
154 r.raise_for_status()
155 r.raw.decode_content = True
156 return r.raw
157
158 def ukey(self, url):
159 """Unique identifier; assume HTTP files are static, unchanging"""
160 return tokenize(url, self.kwargs, self.protocol)
161
162 def info(self, url, **kwargs):
163 """Get info of URL
164
165 Tries to access location via HEAD, and then GET methods, but does
166 not fetch the data.
167
168 It is possible that the server does not supply any size information, in
169 which case size will be given as None (and certain operations on the
170 corresponding file will not work).
171 """
172 size = False
173 for policy in ["head", "get"]:
174 try:
175 size = file_size(url, self.session, policy, **self.kwargs)
176 if size:
177 break
178 except Exception:
179 pass
180 else:
181 # get failed, so conclude URL does not exist
182 if size is False:
183 raise FileNotFoundError(url)
184 return {"name": url, "size": size or None, "type": "file"}
185
186
187 class HTTPFile(AbstractBufferedFile):
188 """
189 A file-like object pointing to a remote HTTP(S) resource
190
191 Supports only reading, with read-ahead of a predetermined block-size.
192
193 In the case that the server does not supply the filesize, only reading of
194 the complete file in one go is supported.
195
196 Parameters
197 ----------
198 url: str
199 Full URL of the remote resource, including the protocol
200 session: requests.Session or None
201 All calls will be made within this session, to avoid restarting
202 connections where the server allows this
203 block_size: int or None
204 The amount of read-ahead to do, in bytes. Default is 5MB, or the value
205 configured for the FileSystem creating this file
206 size: None or int
207 If given, this is the size of the file in bytes, and we don't attempt
208 to call the server to find the value.
209 kwargs: all other key-values are passed to requests calls.
210 """
211
212 def __init__(
213 self,
214 fs,
215 url,
216 session=None,
217 block_size=None,
218 mode="rb",
219 cache_type="bytes",
220 cache_options=None,
221 size=None,
222 **kwargs
223 ):
224 if mode != "rb":
225 raise NotImplementedError("File mode not supported")
226 self.url = url
227 self.session = session if session is not None else requests.Session()
228 if size is not None:
229 self.details = {"name": url, "size": size, "type": "file"}
230 super().__init__(
231 fs=fs,
232 path=url,
233 mode=mode,
234 block_size=block_size,
235 cache_type=cache_type,
236 cache_options=cache_options,
237 **kwargs
238 )
239 self.cache.size = self.size or self.blocksize
240
241 def read(self, length=-1):
242 """Read bytes from file
243
244 Parameters
245 ----------
246 length: int
247 Read up to this many bytes. If negative, read all content to end of
248 file. If the server has not supplied the filesize, attempting to
249 read only part of the data will raise a ValueError.
250 """
251 if (
252 (length < 0 and self.loc == 0) # explicit read all
253 or (length > (self.size or length)) # read more than there is
254 or ( # all fits in one block anyway
255 self.size and self.size < self.blocksize
256 )
257 ):
258 self._fetch_all()
259 if self.size is None:
260 if length < 0:
261 self._fetch_all()
262 else:
263 length = min(self.size - self.loc, length)
264 return super().read(length)
265
266 def _fetch_all(self):
267 """Read whole file in one shot, without caching
268
269 This is only called when position is still at zero,
270 and read() is called without a byte-count.
271 """
272 if not isinstance(self.cache, AllBytes):
273 r = self.session.get(self.url, **self.kwargs)
274 r.raise_for_status()
275 out = r.content
276 self.cache = AllBytes(out)
277 self.size = len(out)
278
279 def _fetch_range(self, start, end):
280 """Download a block of data
281
282 The expectation is that the server returns only the requested bytes,
283 with HTTP code 206. If this is not the case, we first check the headers,
284 and then stream the output - if the data size is bigger than we
285 requested, an exception is raised.
286 """
287 kwargs = self.kwargs.copy()
288 headers = kwargs.pop("headers", {})
289 headers["Range"] = "bytes=%i-%i" % (start, end - 1)
290 r = self.session.get(self.url, headers=headers, stream=True, **kwargs)
291 if r.status_code == 416:
292 # range request outside file
293 return b""
294 r.raise_for_status()
295 if r.status_code == 206:
296 # partial content, as expected
297 out = r.content
298 elif "Content-Length" in r.headers:
299 cl = int(r.headers["Content-Length"])
300 if cl <= end - start:
301 # data size OK
302 out = r.content
303 else:
304 raise ValueError(
305 "Got more bytes (%i) than requested (%i)" % (cl, end - start)
306 )
307 else:
308 cl = 0
309 out = []
310 for chunk in r.iter_content(chunk_size=2 ** 20):
311 # data size unknown, let's see if it goes too big
312 if chunk:
313 out.append(chunk)
314 cl += len(chunk)
315 if cl > end - start:
316 raise ValueError(
317 "Got more bytes so far (>%i) than requested (%i)"
318 % (cl, end - start)
319 )
320 else:
321 break
322 out = b"".join(out)
323 return out
324
325
326 def file_size(url, session=None, size_policy="head", **kwargs):
327 """Call HEAD on the server to get file size
328
329 Default operation is to explicitly allow redirects and use encoding
330 'identity' (no compression) to get the true size of the target.
331 """
332 kwargs = kwargs.copy()
333 ar = kwargs.pop("allow_redirects", True)
334 head = kwargs.get("headers", {}).copy()
335 head["Accept-Encoding"] = "identity"
336 session = session or requests.Session()
337 if size_policy == "head":
338 r = session.head(url, allow_redirects=ar, **kwargs)
339 elif size_policy == "get":
340 kwargs["stream"] = True
341 r = session.get(url, allow_redirects=ar, **kwargs)
342 else:
343 raise TypeError('size_policy must be "head" or "get", got %s' % size_policy)
344 if "Content-Length" in r.headers:
345 return int(r.headers["Content-Length"])
346 elif "Content-Range" in r.headers:
347 return int(r.headers["Content-Range"].split("/")[1])
348
349
350 class AllBytes(object):
351 """Cache entire contents of a remote URL"""
352
353 def __init__(self, data):
354 self.data = data
355
356 def _fetch(self, start, end):
357 return self.data[start:end]
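
Tying the pieces above together, a brief sketch of reading over HTTP; the URLs are placeholders, and the server must expose a listable index page for ``ls`` to find anything:

import fsspec

fs = fsspec.filesystem("http", block_size=2 ** 20)      # HTTPFile with 1 MiB read-ahead

print(fs.ls("http://example.com/data/", detail=False))  # links scraped from the parent page

with fs.open("http://example.com/data/file.csv", "rb") as f:
    head = f.read(100)                                   # served via a Range request when supported

# block_size=0 would instead return the raw streaming requests object
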
0 import io
1 import os
2 import shutil
3 import posixpath
4 import re
5 import tempfile
6 from fsspec import AbstractFileSystem
7 from fsspec.utils import stringify_path
8
9
10 class LocalFileSystem(AbstractFileSystem):
11 """Interface to files on local storage
12
13 Parameters
14 ----------
15 auto_mkdir: bool
16 Whether, when opening a file, the directory containing it should
17 be created (if it doesn't already exist). This is assumed by pyarrow
18 code.
19 """
20
21 root_marker = "/"
22
23 def __init__(self, auto_mkdir=True, **kwargs):
24 super().__init__(**kwargs)
25 self.auto_mkdir = auto_mkdir
26
27 def mkdir(self, path, create_parents=True, **kwargs):
28 path = self._strip_protocol(path)
29 if create_parents:
30 self.makedirs(path, exist_ok=True)
31 else:
32 os.mkdir(path, **kwargs)
33
34 def makedirs(self, path, exist_ok=False):
35 path = self._strip_protocol(path)
36 os.makedirs(path, exist_ok=exist_ok)
37
38 def rmdir(self, path):
39 os.rmdir(path)
40
41 def ls(self, path, detail=False):
42 path = self._strip_protocol(path)
43 paths = [posixpath.join(path, f) for f in os.listdir(path)]
44 if detail:
45 return [self.info(f) for f in paths]
46 else:
47 return paths
48
49 def glob(self, path, **kargs):
50 path = self._strip_protocol(path)
51 return super().glob(path)
52
53 def info(self, path, **kwargs):
54 path = self._strip_protocol(path)
55 out = os.stat(path, follow_symlinks=False)
56 dest = False
57 if os.path.islink(path):
58 t = "link"
59 dest = os.readlink(path)
60 elif os.path.isdir(path):
61 t = "directory"
62 elif os.path.isfile(path):
63 t = "file"
64 else:
65 t = "other"
66 result = {"name": path, "size": out.st_size, "type": t, "created": out.st_ctime}
67 for field in ["mode", "uid", "gid", "mtime"]:
68 result[field] = getattr(out, "st_" + field)
69 if dest:
70 result["destination"] = dest
71 try:
72 out2 = os.stat(path, follow_symlinks=True)
73 result["size"] = out2.st_size
74 except IOError:
75 result["size"] = 0
76 return result
77
78 def copy(self, path1, path2, **kwargs):
79 shutil.copyfile(path1, path2)
80
81 def get(self, path1, path2, **kwargs):
82 if kwargs.get("recursive"):
83 return super(LocalFileSystem, self).get(path1, path2, **kwargs)
84 else:
85 return self.copy(path1, path2, **kwargs)
86
87 def put(self, path1, path2, **kwargs):
88 if kwargs.get("recursive"):
89 return super(LocalFileSystem, self).put(path1, path2, **kwargs)
90 else:
91 return self.copy(path1, path2, **kwargs)
92
93 def mv(self, path1, path2, **kwargs):
94 os.rename(path1, path2)
95
96 def rm(self, path, recursive=False, maxdepth=None):
97 if recursive and self.isdir(path):
98 shutil.rmtree(path)
99 else:
100 os.remove(path)
101
102 def _open(self, path, mode="rb", block_size=None, **kwargs):
103 path = self._strip_protocol(path)
104 if self.auto_mkdir:
105 self.makedirs(self._parent(path), exist_ok=True)
106 return LocalFileOpener(path, mode, fs=self, **kwargs)
107
108 def touch(self, path, **kwargs):
109 path = self._strip_protocol(path)
110 if self.exists(path):
111 os.utime(path, None)
112 else:
113 open(path, "a").close()
114
115 @classmethod
116 def _parent(cls, path):
117 path = cls._strip_protocol(path).rstrip("/")
118 if "/" in path:
119 return path.rsplit("/", 1)[0]
120 else:
121 return cls.root_marker
122
123 @classmethod
124 def _strip_protocol(cls, path):
125 path = stringify_path(path)
126 if path.startswith("file://"):
127 path = path[7:]
128 return make_path_posix(path)
129
130
131 def make_path_posix(path, sep=os.sep):
132 """ Make path generic """
133 if re.match("/[A-Za-z]:", path):
134 # for windows file URI like "file:///C:/folder/file"
135 # or "file:///C:\\dir\\file"
136 path = path[1:]
137 if path.startswith("\\\\"):
138 # special case for windows UNC/DFS-style paths: just flip the
139 # slashes around (the case below does not handle these correctly)
140 return path.replace("\\", "/")
141 if path.startswith("\\") or re.match("[\\\\]*[A-Za-z]:", path):
142 # windows full path "\\server\\path" or "C:\\local\\path"
143 return path.lstrip("\\").replace("\\", "/").replace("//", "/")
144 if (
145 sep not in path
146 and "/" not in path
147 or (sep == "/" and not path.startswith("/"))
148 or (sep == "\\" and ":" not in path)
149 ):
150 # relative path like "path" or "rel\\path" (win) or "rel/path"
151 path = os.path.abspath(path)
152 if os.sep == "\\":
153 # abspath made some more '\\' separators
154 return make_path_posix(path, sep)
155 return path
156
157
158 class LocalFileOpener(object):
159 def __init__(self, path, mode, autocommit=True, fs=None, **kwargs):
160 self.path = path
161 self.mode = mode
162 self.fs = fs
163 self.f = None
164 self.autocommit = autocommit
165 self.blocksize = io.DEFAULT_BUFFER_SIZE
166 self._open()
167
168 def _open(self):
169 if self.f is None or self.f.closed:
170 if self.autocommit or "w" not in self.mode:
171 self.f = open(self.path, mode=self.mode)
172 else:
173 # TODO: check if path is writable?
174 i, name = tempfile.mkstemp()
175 self.temp = name
176 self.f = open(name, mode=self.mode)
177 if "w" not in self.mode:
178 self.details = self.fs.info(self.path)
179 self.size = self.details["size"]
180 self.f.size = self.size
181
182 def _fetch_range(self, start, end):
183 # probably only used by cached FS
184 if "r" not in self.mode:
185 raise ValueError
186 self._open()
187 self.f.seek(start)
188 return self.f.read(end - start)
189
190 def __setstate__(self, state):
191 self.f = None
192 loc = state.pop("loc", None)
193 self.__dict__.update(state)
194 if "r" in state["mode"]:
195 # reopen the file and restore the previous read position
196 self._open()
197 self.f.seek(loc)
198
199 def __getstate__(self):
200 d = self.__dict__.copy()
201 d.pop("f")
202 if "r" in self.mode:
203 d["loc"] = self.f.tell()
204 else:
205 if not self.f.closed:
206 raise ValueError("Cannot serialise open write-mode local file")
207 return d
208
209 def commit(self):
210 if self.autocommit:
211 raise RuntimeError("Can only commit if not already set to autocommit")
212 os.rename(self.temp, self.path)
213
214 def discard(self):
215 if self.autocommit:
216 raise RuntimeError("Cannot discard if set to autocommit")
217 os.remove(self.temp)
218
219 def __fspath__(self):
220 # uniquely for fsspec implementations, this is a real path
221 return self.path
222
223 def __getattr__(self, item):
224 return getattr(self.f, item)
225
226 def __enter__(self):
227 self._incontext = True
228 return self.f.__enter__()
229
230 def __exit__(self, exc_type, exc_value, traceback):
231 self._incontext = False
232 self.f.__exit__(exc_type, exc_value, traceback)
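
A few illustrative conversions through ``make_path_posix`` and a small round-trip with ``LocalFileSystem``; the Windows-style inputs are examples only, and relative inputs would be resolved against the current working directory:

import os
import tempfile
from fsspec.implementations.local import LocalFileSystem, make_path_posix

# examples of the normalisation performed above
assert make_path_posix("/C:/folder/file") == "C:/folder/file"
assert make_path_posix("\\\\server\\share\\file") == "//server/share/file"

fs = LocalFileSystem(auto_mkdir=True)
target = os.path.join(tempfile.mkdtemp(), "sub", "afile")
with fs.open(target, "wb") as f:     # "sub" is created automatically because auto_mkdir=True
    f.write(b"hello")
assert fs.cat(target) == b"hello"
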
0 from __future__ import print_function, division, absolute_import
1
2 from io import BytesIO
3 from fsspec import AbstractFileSystem
4 import logging
5
6 logger = logging.getLogger("fsspec.memoryfs")
7
8
9 class MemoryFileSystem(AbstractFileSystem):
10 """A filesystem based on a dict of BytesIO objects"""
11
12 store = {} # global
13 pseudo_dirs = []
14 protocol = "memory"
15 root_marker = ""
16
17 def ls(self, path, detail=False):
18 if path in self.store:
19 # there is a key with this exact name; it could also be a directory
20 out = [
21 {
22 "name": path,
23 "size": self.store[path].getbuffer().nbytes,
24 "type": "file",
25 }
26 ]
27 else:
28 out = []
29 path = path.strip("/")
30 paths = set()
31 for p2 in self.store:
32 has_slash = "/" if p2.startswith("/") else ""
33 p = p2.lstrip("/")
34 if "/" in p:
35 root = p.rsplit("/", 1)[0]
36 else:
37 root = ""
38 if root == path:
39 out.append(
40 {
41 "name": has_slash + p,
42 "size": self.store[p2].getbuffer().nbytes,
43 "type": "file",
44 }
45 )
46 elif path and all(
47 (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/"))
48 ):
49 # implicit directory
50 ppath = "/".join(p.split("/")[: len(path.split("/")) + 1])
51 if ppath not in paths:
52 out.append(
53 {
54 "name": has_slash + ppath + "/",
55 "size": 0,
56 "type": "directory",
57 }
58 )
59 paths.add(ppath)
60 elif all(
61 (a == b)
62 for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
63 ):
64 # root directory entry
65 ppath = p.rstrip("/").split("/", 1)[0]
66 if ppath not in paths:
67 out.append(
68 {
69 "name": has_slash + ppath + "/",
70 "size": 0,
71 "type": "directory",
72 }
73 )
74 paths.add(ppath)
75 for p2 in self.pseudo_dirs:
76 if self._parent(p2).strip("/").rstrip("/") == path:
77 out.append({"name": p2 + "/", "size": 0, "type": "directory"})
78 if detail:
79 return out
80 return sorted([f["name"] for f in out])
81
82 def mkdir(self, path):
83 path = path.rstrip("/")
84 if path not in self.pseudo_dirs:
85 self.pseudo_dirs.append(path)
86
87 def rmdir(self, path):
88 path = path.rstrip("/")
89 if path in self.pseudo_dirs:
90 if self.ls(path) == []:
91 self.pseudo_dirs.remove(path)
92 else:
93 raise OSError("Directory %s not empty" % path)
94 else:
95 raise FileNotFoundError(path)
96
97 def exists(self, path):
98 return path in self.store
99
100 def _open(self, path, mode="rb", **kwargs):
101 """Make a file-like object
102
103 Parameters
104 ----------
105 path: str
106 identifier
107 mode: str
108 normally "rb", "wb" or "ab"
109 """
110 if mode in ["rb", "ab", "rb+"]:
111 if path in self.store:
112 f = self.store[path]
113 if mode == "rb":
114 f.seek(0)
115 else:
116 f.seek(0, 2)
117 return f
118 else:
119 raise FileNotFoundError(path)
120 if mode == "wb":
121 m = MemoryFile(self, path)
122 if not self._intrans:
123 m.commit()
124 return m
125
126 def copy(self, path1, path2, **kwargs):
127 self.store[path2] = MemoryFile(self, path2, self.store[path1].getbuffer())
128
129 def cat(self, path):
130 return self.store[path].getvalue()
131
132 def _rm(self, path):
133 del self.store[path]
134
135 def size(self, path):
136 """Size in bytes of the file at path"""
137 if path not in self.store:
138 raise FileNotFoundError(path)
139 return self.store[path].getbuffer().nbytes
140
141
142 class MemoryFile(BytesIO):
143 """A BytesIO which can't close and works as a context manager
144
145 Can initialise with data
146
147 No need to provide fs, path if auto-committing (default)
148 """
149
150 def __init__(self, fs, path, data=None):
151 self.fs = fs
152 self.path = path
153 if data:
154 self.write(data)
155 self.size = len(data)
156 self.seek(0)
157
158 def __enter__(self):
159 return self
160
161 def close(self):
162 self.size = self.seek(0, 2)
163
164 def discard(self):
165 pass
166
167 def commit(self):
168 self.fs.store[self.path] = self
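
A compact illustration of the in-memory filesystem above; note that ``store`` is a class-level dict, so every instance of the "memory" protocol in a process shares the same contents:

import fsspec

m = fsspec.filesystem("memory")
with m.open("afile", "wb") as f:       # a MemoryFile, committed to the shared store on close
    f.write(b"data")

assert m.cat("afile") == b"data"
assert "afile" in m.ls("", detail=False)
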
0 import paramiko
1 from stat import S_ISDIR, S_ISLNK
2 import types
3 import uuid
4 from .. import AbstractFileSystem
5 from ..utils import infer_storage_options
6
7
8 class SFTPFileSystem(AbstractFileSystem):
9 """Files over SFTP/SSH
10
11 Peer-to-peer filesystem over SSH using paramiko.
12 """
13
14 protocol = "sftp", "ssh"
15
16 def __init__(self, host, **ssh_kwargs):
17 """
18
19 Parameters
20 ----------
21 host: str
22 Hostname or IP as a string
23 temppath: str
24 Location on the server to put files, when within a transaction
25 ssh_kwargs: dict
26 Parameters passed on to connection. See details in
27 http://docs.paramiko.org/en/2.4/api/client.html#paramiko.client.SSHClient.connect
28 May include port, username, password...
29 """
30 if self._cached:
31 return
32 super(SFTPFileSystem, self).__init__(**ssh_kwargs)
33 self.temppath = ssh_kwargs.pop("temppath", "/tmp")
34 self.host = host
35 self.ssh_kwargs = ssh_kwargs
36 self._connect()
37
38 def _connect(self):
39 self.client = paramiko.SSHClient()
40 self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
41 self.client.connect(self.host, **self.ssh_kwargs)
42 self.ftp = self.client.open_sftp()
43
44 @classmethod
45 def _strip_protocol(cls, path):
46 return infer_storage_options(path)["path"]
47
48 @staticmethod
49 def _get_kwargs_from_urls(urlpath):
50 out = infer_storage_options(urlpath)
51 out.pop("path", None)
52 out.pop("protocol", None)
53 return out
54
55 def mkdir(self, path, mode=511):
56 self.ftp.mkdir(path, mode)
57
58 def makedirs(self, path, exist_ok=False, mode=511):
59 if self.exists(path) and not exist_ok:
60 raise FileExistsError("File exists: {}".format(path))
61
62 parts = path.split("/")
63 path = ""
64
65 for part in parts:
66 path += "/" + part
67 if not self.exists(path):
68 self.mkdir(path, mode)
69
70 def rmdir(self, path):
71 self.ftp.rmdir(path)
72
73 def info(self, path):
74 s = self.ftp.stat(path)
75 if S_ISDIR(s.st_mode):
76 t = "directory"
77 elif S_ISLNK(s.st_mode):
78 t = "link"
79 else:
80 t = "file"
81 return {
82 "name": path + "/" if t == "directory" else path,
83 "size": s.st_size,
84 "type": t,
85 "uid": s.st_uid,
86 "gid": s.st_gid,
87 "time": s.st_atime,
88 "mtime": s.st_mtime,
89 }
90
91 def ls(self, path, detail=False):
92 out = ["/".join([path.rstrip("/"), p]) for p in self.ftp.listdir(path)]
93 out = [self.info(o) for o in out]
94 if detail:
95 return out
96 return sorted([p["name"] for p in out])
97
98 def put(self, lpath, rpath):
99 self.ftp.put(lpath, rpath)
100
101 def get(self, rpath, lpath):
102 self.ftp.get(rpath, lpath)
103
104 def _open(self, path, mode="rb", block_size=None, **kwargs):
105 """
106 block_size: int or None
107 If 0, no buffering, if 1, line buffering, if >1, buffer that many
108 bytes, if None use default from paramiko.
109 """
110 if kwargs.get("autocommit", True) is False:
111 # writes to temporary file, move on commit
112 path2 = "{}/{}".format(self.temppath, uuid.uuid4())
113 f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
114 f.temppath = path2
115 f.targetpath = path
116 f.fs = self
117 f.commit = types.MethodType(commit_a_file, f)
118 f.discard = types.MethodType(discard_a_file, f)
119 else:
120 f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
121 return f
122
123 def _rm(self, path):
124 if self.isdir(path):
125 self.ftp.rmdir(path)
126 else:
127 self.ftp.remove(path)
128
129 def mv(self, old, new):
130 self.ftp.posix_rename(old, new)
131
132
133 def commit_a_file(self):
134 self.fs.mv(self.temppath, self.targetpath)
135
136
137 def discard_a_file(self):
138 self.fs._rm(self.temppath)
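
A connection sketch for the SFTP implementation above; the host and credentials are placeholders, and any keyword arguments are passed straight through to paramiko's ``SSHClient.connect``:

from fsspec.implementations.sftp import SFTPFileSystem

# placeholder connection details
fs = SFTPFileSystem("sftp.example.com", port=22, username="user", password="secret")

print(fs.ls("/home/user", detail=False))
with fs.open("/home/user/notes.txt", "rb") as f:   # a paramiko SFTPFile under the hood
    data = f.read()
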
0 import os
1 import shutil
2 import pickle
3 import pytest
4
5 import fsspec
6 from fsspec.implementations.cached import CachingFileSystem
7 from .test_ftp import FTPFileSystem
8
9
10 @pytest.fixture
11 def local_filecache():
12 import tempfile
13
14 original_location = tempfile.mkdtemp()
15 cache_location = tempfile.mkdtemp()
16 original_file = os.path.join(original_location, "afile")
17 data = b"test data"
18 with open(original_file, "wb") as f:
19 f.write(data)
20
21 # we can access the file and read it
22 fs = fsspec.filesystem(
23 "filecache", target_protocol="file", cache_storage=cache_location
24 )
25
26 return (data, original_file, cache_location, fs)
27
28
29 def test_idempotent():
30 fs = CachingFileSystem("file")
31 fs2 = CachingFileSystem("file")
32 assert fs2 is fs
33 fs3 = pickle.loads(pickle.dumps(fs))
34 assert fs3.storage == fs.storage
35
36
37 def test_workflow(ftp_writable):
38 host, port, user, pw = ftp_writable
39 fs = FTPFileSystem(host, port, user, pw)
40 with fs.open("/out", "wb") as f:
41 f.write(b"test")
42 fs = fsspec.filesystem(
43 "cached",
44 target_protocol="ftp",
45 target_options={"host": host, "port": port, "username": user, "password": pw},
46 )
47 assert os.listdir(fs.storage[-1]) == []
48 with fs.open("/out") as f:
49 assert os.listdir(fs.storage[-1])
50 assert f.read() == b"test"
51 assert fs.cached_files[-1]["ftp:///out"]["blocks"]
52 assert fs.cat("/out") == b"test"
53 assert fs.cached_files[-1]["ftp:///out"]["blocks"] is True
54
55 with fs.open("/out", "wb") as f:
56 f.write(b"changed")
57
58 assert fs.cat("/out") == b"test" # old value
59
60
61 def test_blocksize(ftp_writable):
62 host, port, user, pw = ftp_writable
63 fs = FTPFileSystem(host, port, user, pw)
64 with fs.open("/out_block", "wb") as f:
65 f.write(b"test" * 4000)
66
67 fs = fsspec.filesystem(
68 "blockcache",
69 target_protocol="ftp",
70 target_options={"host": host, "port": port, "username": user, "password": pw},
71 )
72
73 with fs.open("/out_block", block_size=20) as f:
74 assert f.read(1) == b"t"
75 with pytest.raises(ValueError):
76 fs.open("/out_block", block_size=30)
77
78
79 def test_local_filecache_creates_dir_if_needed():
80 import tempfile
81
82 original_location = tempfile.mkdtemp()
83 cache_location = "foofoobarbar"
84 assert not os.path.exists(cache_location)
85
86 try:
87 original_file = os.path.join(original_location, "afile")
88 data = b"test data"
89 with open(original_file, "wb") as f:
90 f.write(data)
91
92 # we can access the file and read it
93 fs = fsspec.filesystem(
94 "filecache", target_protocol="file", cache_storage=cache_location
95 )
96
97 with fs.open(original_file, "rb") as f:
98 data_in_cache = f.read()
99
100 assert os.path.exists(cache_location)
101
102 finally:
103 shutil.rmtree(cache_location)
104
105 assert data_in_cache == data
106
107
108 def test_local_filecache_basic(local_filecache):
109 data, original_file, cache_location, fs = local_filecache
110
111 # reading from the file contains the right data
112 with fs.open(original_file, "rb") as f:
113 assert f.read() == data
114 assert "cache" in os.listdir(cache_location)
115
116 # the file in the location contains the right data
117 fn = list(fs.cached_files[-1].values())[0]["fn"] # this is a hash value
118 assert fn in os.listdir(cache_location)
119 with open(os.path.join(cache_location, fn), "rb") as f:
120 assert f.read() == data
121
122 # still there when original file is removed (check=False)
123 os.remove(original_file)
124 with fs.open(original_file, "rb") as f:
125 assert f.read() == data
126
127
128 def test_local_filecache_does_not_change_when_original_data_changed(local_filecache):
129 old_data, original_file, cache_location, fs = local_filecache
130 new_data = b"abc"
131
132 with fs.open(original_file, "rb") as f:
133 assert f.read() == old_data
134
135 with open(original_file, "wb") as f:
136 f.write(new_data)
137
138 with fs.open(original_file, "rb") as f:
139 assert f.read() == old_data
140
141
142 def test_local_filecache_gets_from_original_if_cache_deleted(local_filecache):
143 old_data, original_file, cache_location, fs = local_filecache
144 new_data = b"abc"
145
146 with fs.open(original_file, "rb") as f:
147 assert f.read() == old_data
148
149 with open(original_file, "wb") as f:
150 f.write(new_data)
151
152 shutil.rmtree(cache_location)
153 assert os.path.exists(original_file)
154
155 with open(original_file, "rb") as f:
156 assert f.read() == new_data
157
158 with fs.open(original_file, "rb") as f:
159 assert f.read() == new_data
160
161 # the file in the location contains the right data
162 fn = list(fs.cached_files[-1].values())[0]["fn"] # this is a hash value
163 assert fn in os.listdir(cache_location)
164 with open(os.path.join(cache_location, fn), "rb") as f:
165 assert f.read() == new_data
166
167
168 def test_local_filecache_with_new_cache_location_makes_a_new_copy(local_filecache):
169 import tempfile
170
171 data, original_file, old_cache_location, old_fs = local_filecache
172 new_cache_location = tempfile.mkdtemp()
173
174 with old_fs.open(original_file, "rb") as f:
175 assert f.read() == data
176
177 new_fs = fsspec.filesystem(
178 "filecache", target_protocol="file", cache_storage=new_cache_location
179 )
180
181 with new_fs.open(original_file, "rb") as f:
182 assert f.read() == data
183
184 # the file in the location contains the right data
185 fn = list(new_fs.cached_files[-1].values())[0]["fn"] # this is a hash value
186 assert fn in os.listdir(old_cache_location)
187 assert fn in os.listdir(new_cache_location)
188
189 with open(os.path.join(new_cache_location, fn), "rb") as f:
190 assert f.read() == data
191
192
193 def test_filecache_multicache():
194 import tempfile
195
196 origin = tempfile.mkdtemp()
197 cache1 = tempfile.mkdtemp()
198 cache2 = tempfile.mkdtemp()
199 data = b"test data"
200 f1 = os.path.join(origin, "afile")
201 f2 = os.path.join(origin, "bfile")
202 with open(f1, "wb") as f:
203 f.write(data)
204 with open(f2, "wb") as f:
205 f.write(data * 2)
206
207 # populates first cache
208 fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1)
209 assert fs.cat(f1) == data
210
211 assert len(os.listdir(cache1)) == 2 # cache and hashed afile
212 assert len(os.listdir(cache2)) == 0 # hasn't been initialized yet
213
214 # populates last cache if file not found in first cache
215 fs = fsspec.filesystem(
216 "filecache", target_protocol="file", cache_storage=[cache1, cache2]
217 )
218
219 assert fs.cat(f1) == data
220 assert fs.cat(f2) == data * 2
221
222 assert "cache" in os.listdir(cache1)
223 assert "cache" in os.listdir(cache2)
224
225 cache1_contents = [f for f in os.listdir(cache1) if f != "cache"]
226 assert len(cache1_contents) == 1
227
228 with open(os.path.join(cache1, cache1_contents[0]), "rb") as f:
229 assert f.read() == data
230
231 cache2_contents = [f for f in os.listdir(cache2) if f != "cache"]
232 assert len(cache2_contents) == 1
233
234 with open(os.path.join(cache2, cache2_contents[0]), "rb") as f:
235 assert f.read() == data * 2
236
237
238 def test_filecache_multicache_with_same_file_different_data_reads_from_first():
239 import tempfile
240
241 origin = tempfile.mkdtemp()
242 cache1 = tempfile.mkdtemp()
243 cache2 = tempfile.mkdtemp()
244 data = b"test data"
245 f1 = os.path.join(origin, "afile")
246 with open(f1, "wb") as f:
247 f.write(data)
248
249 # populate first cache
250 fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1)
251 assert fs.cat(f1) == data
252
253 with open(f1, "wb") as f:
254 f.write(data * 2)
255
256 # populate second cache
257 fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache2)
258
259 assert fs.cat(f1) == data * 2
260
261 # the filenames in each cache are the same, but the data is different
262 assert os.listdir(cache1) == os.listdir(cache2)
263
264 fs = fsspec.filesystem(
265 "filecache", target_protocol="file", cache_storage=[cache1, cache2]
266 )
267
268 assert fs.cat(f1) == data
269
270
271 def test_filecache_with_checks():
272 import tempfile
273 import time
274
275 origin = tempfile.mkdtemp()
276 cache1 = tempfile.mkdtemp()
277 data = b"test data"
278 f1 = os.path.join(origin, "afile")
279 with open(f1, "wb") as f:
280 f.write(data)
281
282 # populate first cache
283 fs = fsspec.filesystem(
284 "filecache", target_protocol="file", cache_storage=cache1, expiry_time=0.1
285 )
286 fs2 = fsspec.filesystem(
287 "filecache", target_protocol="file", cache_storage=cache1, check_files=True
288 )
289 assert fs.cat(f1) == data
290 assert fs2.cat(f1) == data
291
292 with open(f1, "wb") as f:
293 f.write(data * 2)
294
295 assert fs.cat(f1) == data # does not change
296 assert fs2.cat(f1) == data * 2 # changed, since origin changed
297 time.sleep(0.11) # allow cache details to expire
298 assert fs.cat(f1) == data * 2 # changed, since origin changed
299
300
301 def test_takes_fs_instance():
302 import tempfile
303
304 origin = tempfile.mkdtemp()
305 data = b"test data"
306 f1 = os.path.join(origin, "afile")
307 with open(f1, "wb") as f:
308 f.write(data)
309
310 fs = fsspec.filesystem("file")
311 fs2 = fsspec.filesystem("filecache", target_protocol=fs)
312
313 assert fs2.cat(f1) == data
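
The fixtures and tests above exercise the caching layers end to end; for reference, the same "filecache" pattern in standalone form (temporary directories, everything local) looks roughly like this:

import os
import tempfile

import fsspec

origin = tempfile.mkdtemp()
cache = tempfile.mkdtemp()
with open(os.path.join(origin, "afile"), "wb") as f:
    f.write(b"test data")

# repeated reads of the same path are served from the local copy in `cache`
fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache)
assert fs.cat(os.path.join(origin, "afile")) == b"test data"
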
0 import pytest
1 import fsspec
2
3 pytest.importorskip("distributed")
4
5
6 @pytest.fixture()
7 def cli(tmpdir):
8 import dask.distributed
9
10 client = dask.distributed.Client(n_workers=1)
11
12 def setup():
13 m = fsspec.filesystem("memory")
14 with m.open("afile", "wb") as f:
15 f.write(b"data")
16
17 client.run(setup)
18 try:
19 yield client
20 finally:
21 client.close()
22
23
24 def test_basic(cli):
25
26 fs = fsspec.filesystem("dask", remote_protocol="memory")
27 assert fs.ls("") == ["afile"]
28 assert fs.cat("afile") == b"data"
0 import os
1 import pytest
2 import subprocess
3 import sys
4 import time
5
6 from fsspec.implementations.ftp import FTPFileSystem
7 from fsspec import open_files
8 import fsspec
9
10 here = os.path.dirname(os.path.abspath(__file__))
11
12
13 @pytest.fixture()
14 def ftp():
15 P = subprocess.Popen(
16 [sys.executable, "-m", "pyftpdlib", "-d", here],
17 stderr=subprocess.STDOUT,
18 stdout=subprocess.PIPE,
19 )
20 try:
21 time.sleep(1)
22 yield "localhost", 2121
23 finally:
24 P.terminate()
25 P.wait()
26
27
28 def test_basic(ftp):
29 host, port = ftp
30 fs = FTPFileSystem(host, port)
31 assert fs.ls("/", detail=False) == sorted(os.listdir(here))
32 out = fs.cat("/" + os.path.basename(__file__))
33 assert out == open(__file__, "rb").read()
34
35
36 def test_not_cached(ftp):
37 host, port = ftp
38 fs = FTPFileSystem(host, port)
39 fs2 = FTPFileSystem(host, port)
40 assert fs is not fs2
41
42
43 @pytest.mark.parametrize("cache_type", ["bytes", "mmap"])
44 def test_complex(ftp_writable, cache_type):
45 from fsspec.core import BytesCache
46
47 host, port, user, pw = ftp_writable
48 files = open_files(
49 "ftp:///ou*",
50 host=host,
51 port=port,
52 username=user,
53 password=pw,
54 block_size=10000,
55 cache_type=cache_type,
56 )
57 assert len(files) == 1
58 with files[0] as fo:
59 assert fo.read(10) == b"hellohello"
60 if isinstance(fo.cache, BytesCache):
61 assert len(fo.cache.cache) == 10010
62 assert fo.read(2) == b"he"
63 assert fo.tell() == 12
64
65
66 def test_write_small(ftp_writable):
67 host, port, user, pw = ftp_writable
68 fs = FTPFileSystem(host, port, user, pw)
69 with fs.open("/out2", "wb") as f:
70 f.write(b"oi")
71 assert fs.cat("/out2") == b"oi"
72
73
74 def test_with_url(ftp_writable):
75 host, port, user, pw = ftp_writable
76 fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "wb")
77 with fo as f:
78 f.write(b"hello")
79 fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "rb")
80 with fo as f:
81 assert f.read() == b"hello"
82
83
84 @pytest.mark.parametrize("cache_type", ["bytes", "mmap"])
85 def test_write_big(ftp_writable, cache_type):
86 host, port, user, pw = ftp_writable
87 fs = FTPFileSystem(host, port, user, pw, block_size=1000, cache_type=cache_type)
88 fn = "/bigger"
89 with fs.open(fn, "wb") as f:
90 f.write(b"o" * 500)
91 assert not fs.exists(fn)
92 f.write(b"o" * 1000)
93 fs.invalidate_cache()
94 assert fs.exists(fn)
95 f.write(b"o" * 200)
96 f.flush()
97
98 assert fs.info(fn)["size"] == 1700
99 assert fs.cat(fn) == b"o" * 1700
100
101
102 def test_transaction(ftp_writable):
103 host, port, user, pw = ftp_writable
104 fs = FTPFileSystem(host, port, user, pw)
105 fs.mkdir("/tmp")
106 fn = "/tr"
107 with fs.transaction:
108 with fs.open(fn, "wb") as f:
109 f.write(b"not")
110 assert not fs.exists(fn)
111 assert fs.exists(fn)
112 assert fs.cat(fn) == b"not"
113
114 fs.rm(fn)
115 assert not fs.exists(fn)
0 import pytest
1 from http.server import BaseHTTPRequestHandler, HTTPServer
2 import threading
3 import fsspec
4
5 requests = pytest.importorskip("requests")
6 port = 9898
7 data = b"\n".join([b"some test data"] * 1000)
8 realfile = "http://localhost:%i/index/realfile" % port
9 index = b'<a href="%s">Link</a>' % realfile.encode()
10
11
12 class HTTPTestHandler(BaseHTTPRequestHandler):
13 def _respond(self, code=200, headers=None, data=b""):
14 headers = headers or {}
15 headers.update({"User-Agent": "test"})
16 self.send_response(code)
17 for k, v in headers.items():
18 self.send_header(k, str(v))
19 self.end_headers()
20 if data:
21 self.wfile.write(data)
22
23 def do_GET(self):
24 if self.path not in ["/index/realfile", "/index"]:
25 self._respond(404)
26 return
27
28 d = data if self.path == "/index/realfile" else index
29 if "Range" in self.headers:
30 ran = self.headers["Range"]
31 b, ran = ran.split("=")
32 start, end = ran.split("-")
33 print(start)
34 print(end)
35 d = d[int(start) : int(end) + 1]
36 if "give_length" in self.headers:
37 response_headers = {"Content-Length": len(d)}
38 self._respond(200, response_headers, d)
39 elif "give_range" in self.headers:
40 self._respond(200, {"Content-Range": "0-%i/%i" % (len(d) - 1, len(d))}, d)
41 else:
42 self._respond(200, data=d)
43
44 def do_HEAD(self):
45 if "head_ok" not in self.headers:
46 self._respond(405)
47 return
48 d = data if self.path == "/index/realfile" else index
49 if self.path not in ["/index/realfile", "/index"]:
50 self._respond(404)
51 elif "give_length" in self.headers:
52 response_headers = {"Content-Length": len(d)}
53 if "zero_length" in self.headers:
54 response_headers["Content-Length"] = 0
55
56 self._respond(200, response_headers)
57 elif "give_range" in self.headers:
58 self._respond(200, {"Content-Range": "0-%i/%i" % (len(d) - 1, len(d))})
59 else:
60 self._respond(200) # OK response, but no useful info
61
62
63 @pytest.fixture(scope="module")
64 def server():
65 server_address = ("", port)
66 httpd = HTTPServer(server_address, HTTPTestHandler)
67 th = threading.Thread(target=httpd.serve_forever)
68 th.daemon = True
69 th.start()
70 try:
71 yield "http://localhost:%i" % port
72 finally:
73 httpd.socket.close()
74 httpd.shutdown()
75 th.join()
76
77
78 def test_list(server):
79 h = fsspec.filesystem("http")
80 out = h.glob(server + "/index/*")
81 assert out == [server + "/index/realfile"]
82
83
84 def test_policy_arg(server):
85 h = fsspec.filesystem("http", size_policy="get")
86 out = h.glob(server + "/index/*")
87 assert out == [server + "/index/realfile"]
88
89
90 def test_exists(server):
91 h = fsspec.filesystem("http")
92 assert not h.exists(server + "/notafile")
93
94
95 def test_read(server):
96 h = fsspec.filesystem("http")
97 out = server + "/index/realfile"
98 with h.open(out, "rb") as f:
99 assert f.read() == data
100 with h.open(out, "rb", block_size=0) as f:
101 assert f.read() == data
102 with h.open(out, "rb") as f:
103 assert f.read(100) + f.read() == data
104
105
106 def test_methods(server):
107 h = fsspec.filesystem("http")
108 url = server + "/index/realfile"
109 assert h.exists(url)
110 assert h.cat(url) == data
111
112
113 @pytest.mark.parametrize(
114 "headers",
115 [
116 {},
117 {"give_length": "true"},
118 {"give_length": "true", "head_ok": "true"},
119 {"give_range": "true"},
120 ],
121 )
122 def test_random_access(server, headers):
123 h = fsspec.filesystem("http", headers=headers)
124 url = server + "/index/realfile"
125 with h.open(url, "rb") as f:
126 if headers:
127 assert f.size == len(data)
128 assert f.read(5) == data[:5]
129 # python server does not respect bytes range request
130 # we actually get all the data
131 f.seek(5, 1)
132 assert f.read(5) == data[10:15]
133
134
135 def test_mapper_url(server):
136 h = fsspec.filesystem("http")
137 mapper = h.get_mapper(server + "/index/")
138 assert mapper.root.startswith("http:")
139 assert list(mapper)
140
141 mapper2 = fsspec.get_mapper(server + "/index/")
142 assert mapper2.root.startswith("http:")
143 assert list(mapper) == list(mapper2)
144
145
146 def test_content_length_zero(server):
147 h = fsspec.filesystem(
148 "http", headers={"give_length": "true", "zero_length": "true"}
149 )
150 url = server + "/index/realfile"
151
152 with h.open(url, "rb") as f:
153 assert f.read() == data
0 from __future__ import print_function, division, absolute_import
1
2 import gzip
3 import os
4 import os.path
5 import sys
6 from contextlib import contextmanager
7 import tempfile
8
9 import pytest
10 import fsspec
11 from fsspec.core import open_files, get_fs_token_paths, OpenFile
12 from fsspec.implementations.local import LocalFileSystem, make_path_posix
13 from fsspec import compression
14
15 files = {
16 ".test.accounts.1.json": (
17 b'{"amount": 100, "name": "Alice"}\n'
18 b'{"amount": 200, "name": "Bob"}\n'
19 b'{"amount": 300, "name": "Charlie"}\n'
20 b'{"amount": 400, "name": "Dennis"}\n'
21 ),
22 ".test.accounts.2.json": (
23 b'{"amount": 500, "name": "Alice"}\n'
24 b'{"amount": 600, "name": "Bob"}\n'
25 b'{"amount": 700, "name": "Charlie"}\n'
26 b'{"amount": 800, "name": "Dennis"}\n'
27 ),
28 }
29
30
31 csv_files = {
32 ".test.fakedata.1.csv": (b"a,b\n" b"1,2\n"),
33 ".test.fakedata.2.csv": (b"a,b\n" b"3,4\n"),
34 }
35
36
37 @contextmanager
38 def filetexts(d, open=open, mode="t"):
39 """ Dumps a number of textfiles to disk
40
41 d - dict
42 a mapping from filename to text like {'a.csv': '1,1\n2,2'}
43
44 Since this is meant for use in tests, this context manager will
45 automatically switch to a temporary current directory, to avoid
46 race conditions when running tests in parallel.
47 """
48 odir = os.getcwd()
49 dirname = tempfile.mkdtemp()
50 try:
51 os.chdir(dirname)
52 for filename, text in d.items():
53 f = open(filename, "w" + mode)
54 try:
55 f.write(text)
56 finally:
57 try:
58 f.close()
59 except AttributeError:
60 pass
61
62 yield list(d)
63
64 for filename in d:
65 if os.path.exists(filename):
66 try:
67 os.remove(filename)
68 except (IOError, OSError):
69 pass
70 finally:
71 os.chdir(odir)
72
73
74 def test_urlpath_inference_strips_protocol(tmpdir):
75 tmpdir = str(tmpdir)
76 paths = [os.path.join(tmpdir, "test.%02d.csv" % i) for i in range(20)]
77
78 for path in paths:
79 with open(path, "wb") as f:
80 f.write(b"1,2,3\n" * 10)
81
82 # globstring
83 protocol = "file:///" if sys.platform == "win32" else "file://"
84 urlpath = protocol + os.path.join(tmpdir, "test.*.csv")
85 _, _, paths2 = get_fs_token_paths(urlpath)
86 assert paths2 == paths
87
88 # list of paths
89 _, _, paths2 = get_fs_token_paths([protocol + p for p in paths])
90 assert paths2 == paths
91
92
93 def test_urlpath_inference_errors():
94 # Empty list
95 with pytest.raises(ValueError) as err:
96 get_fs_token_paths([])
97 assert "empty" in str(err.value)
98
99 # Protocols differ
100 with pytest.raises(ValueError) as err:
101 get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"])
102 assert "same protocol" in str(err.value)
103
104 # Unknown type
105 with pytest.raises(TypeError):
106 get_fs_token_paths(
107 {"sets/are.csv", "unordered/so/they.csv", "should/not/be.csvallowed.csv"}
108 )
109
110
111 def test_urlpath_expand_read():
112 """Make sure * is expanded in file paths when reading."""
113 # when reading, globs should be expanded to read files by mask
114 with filetexts(csv_files, mode="b"):
115 _, _, paths = get_fs_token_paths("./.*.csv")
116 assert len(paths) == 2
117 _, _, paths = get_fs_token_paths(["./.*.csv"])
118 assert len(paths) == 2
119
120
121 def test_urlpath_expand_write():
122 """Make sure * is expanded in file paths when writing."""
123 _, _, paths = get_fs_token_paths("prefix-*.csv", mode="wb", num=2)
124 assert all(
125 [p.endswith(pa) for p, pa in zip(paths, ["/prefix-0.csv", "/prefix-1.csv"])]
126 )
127 _, _, paths = get_fs_token_paths(["prefix-*.csv"], mode="wb", num=2)
128 assert all(
129 [p.endswith(pa) for p, pa in zip(paths, ["/prefix-0.csv", "/prefix-1.csv"])]
130 )
131 # we can read with multiple masks, but not write
132 with pytest.raises(ValueError):
133 _, _, paths = get_fs_token_paths(
134 ["prefix1-*.csv", "prefix2-*.csv"], mode="wb", num=2
135 )
136
137
138 def test_open_files():
139 with filetexts(files, mode="b"):
140 myfiles = open_files("./.test.accounts.*")
141 assert len(myfiles) == len(files)
142 for lazy_file, data_file in zip(myfiles, sorted(files)):
143 with lazy_file as f:
144 x = f.read()
145 assert x == files[data_file]
146
147
148 @pytest.mark.parametrize("encoding", ["utf-8", "ascii"])
149 def test_open_files_text_mode(encoding):
150 with filetexts(files, mode="b"):
151 myfiles = open_files("./.test.accounts.*", mode="rt", encoding=encoding)
152 assert len(myfiles) == len(files)
153 data = []
154 for file in myfiles:
155 with file as f:
156 data.append(f.read())
157 assert list(data) == [files[k].decode(encoding) for k in sorted(files)]
158
159
160 @pytest.mark.parametrize("mode", ["rt", "rb"])
161 @pytest.mark.parametrize("fmt", list(compression.compr))
162 def test_compressions(fmt, mode, tmpdir):
163 if fmt == "zip" and sys.version_info < (3, 6):
164 pytest.xfail("zip compression requires python3.6 or higher")
165
166 tmpdir = str(tmpdir)
167 fn = os.path.join(tmpdir, ".tmp.getsize")
168 fs = LocalFileSystem()
169 f = OpenFile(fs, fn, compression=fmt, mode="wb")
170 data = b"Long line of readily compressible text"
171 with f as fo:
172 fo.write(data)
173 if fmt is None:
174 assert fs.size(fn) == len(data)
175 else:
176 assert fs.size(fn) != len(data)
177
178 f = OpenFile(fs, fn, compression=fmt, mode=mode)
179 with f as fo:
180 if mode == "rb":
181 assert fo.read() == data
182 else:
183 assert fo.read() == data.decode()
184
185
186 def test_bad_compression():
187 with filetexts(files, mode="b"):
188 for func in [open_files]:
189 with pytest.raises(ValueError):
190 func("./.test.accounts.*", compression="not-found")
191
192
193 def test_not_found():
194 fn = "not-a-file"
195 fs = LocalFileSystem()
196 with pytest.raises((FileNotFoundError, OSError)):
197 with OpenFile(fs, fn, mode="rb"):
198 pass
199
200
201 def test_isfile():
202 fs = LocalFileSystem()
203 with filetexts(files, mode="b"):
204 for f in files.keys():
205 assert fs.isfile(f)
206 assert not fs.isfile("not-a-file")
207
208
209 def test_isdir():
210 fs = LocalFileSystem()
211 with filetexts(files, mode="b"):
212 for f in files.keys():
213 assert fs.isdir(os.path.dirname(os.path.abspath(f)))
214 assert not fs.isdir(f)
215 assert not fs.isdir("not-a-dir")
216
217
218 @pytest.mark.parametrize("compression_opener", [(None, open), ("gzip", gzip.open)])
219 def test_open_files_write(tmpdir, compression_opener):
220 tmpdir = str(tmpdir)
221 compression, opener = compression_opener
222 fn = str(tmpdir) + "/*.part"
223 files = open_files(fn, num=2, mode="wb", compression=compression)
224 assert len(files) == 2
225 assert {f.mode for f in files} == {"wb"}
226 for fil in files:
227 with fil as f:
228 f.write(b"000")
229 files = sorted(os.listdir(tmpdir))
230 assert files == ["0.part", "1.part"]
231
232 with opener(os.path.join(tmpdir, files[0]), "rb") as f:
233 d = f.read()
234 assert d == b"000"
235
236
237 def test_pickability_of_lazy_files(tmpdir):
238 tmpdir = str(tmpdir)
239 cloudpickle = pytest.importorskip("cloudpickle")
240
241 with filetexts(files, mode="b"):
242 myfiles = open_files("./.test.accounts.*")
243 myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles))
244
245 for f, f2 in zip(myfiles, myfiles2):
246 assert f.path == f2.path
247 assert isinstance(f.fs, type(f2.fs))
248 with f as f_open, f2 as f2_open:
249 assert f_open.read() == f2_open.read()
250
251
252 def test_abs_paths(tmpdir):
253 tmpdir = str(tmpdir)
254 here = os.getcwd()
255 os.chdir(tmpdir)
256 with open("tmp", "w") as f:
257 f.write("hi")
258 out = LocalFileSystem().glob("./*")
259 assert len(out) == 1
260 assert os.sep in out[0]
261 assert "tmp" in out[0]
262
263 # I don't know what this was testing - but should avoid local paths anyway
264 # fs = LocalFileSystem()
265 os.chdir(here)
266 # with fs.open('tmp', 'r') as f:
267 # res = f.read()
268 # assert res == 'hi'
269
270
271 @pytest.mark.parametrize("sep", ["/", "\\"])
272 @pytest.mark.parametrize("chars", ["+", "++", "(", ")", "|", "\\"])
273 def test_glob_weird_characters(tmpdir, sep, chars):
274 tmpdir = str(tmpdir)
275
276 subdir = tmpdir + sep + "test" + chars + "x"
277 os.mkdir(subdir)
278 with open(subdir + sep + "tmp", "w") as f:
279 f.write("hi")
280
281 out = LocalFileSystem().glob(subdir + sep + "*")
282 assert len(out) == 1
283 assert os.sep in out[0]
284 assert "tmp" in out[0]
285
286
287 def test_globfind_dirs(tmpdir):
288 tmpdir = str(tmpdir)
289 fs = fsspec.filesystem("file")
290 fs.mkdir(tmpdir + "/dir")
291 fs.touch(tmpdir + "/dir/afile")
292 assert [tmpdir + "/dir"] == fs.glob(tmpdir + "/*")
293 assert [tmpdir + "/dir/afile"] == fs.find(tmpdir)
294 assert [tmpdir + "/dir", tmpdir + "/dir/afile"] == fs.find(tmpdir, withdirs=True)
295
296
297 def test_get_pyarrow_filesystem():
298 pa = pytest.importorskip("pyarrow")
299
300 fs = LocalFileSystem()
301 assert isinstance(fs, pa.filesystem.FileSystem)
302 assert fs._get_pyarrow_filesystem() is fs
303
304 class UnknownFileSystem(object):
305 pass
306
307 assert not isinstance(UnknownFileSystem(), pa.filesystem.FileSystem)
308
309
310 def test_directories(tmpdir):
311 tmpdir = str(tmpdir)
312 fs = LocalFileSystem()
313 fs.mkdir(tmpdir + "/dir")
314 assert tmpdir + "/dir" in fs.ls(tmpdir)
315 assert fs.ls(tmpdir, True)[0]["type"] == "directory"
316 fs.rmdir(tmpdir + "/dir")
317 assert not fs.ls(tmpdir)
318
319
320 def test_file_ops(tmpdir):
321 tmpdir = str(tmpdir)
322 fs = LocalFileSystem()
323 with pytest.raises(FileNotFoundError):
324 fs.info(tmpdir + "/nofile")
325 fs.touch(tmpdir + "/afile")
326 i1 = fs.ukey(tmpdir + "/afile")
327
328 assert tmpdir + "/afile" in fs.ls(tmpdir)
329
330 with fs.open(tmpdir + "/afile", "wb") as f:
331 f.write(b"data")
332 i2 = fs.ukey(tmpdir + "/afile")
333 assert i1 != i2 # because file changed
334
335 fs.copy(tmpdir + "/afile", tmpdir + "/afile2")
336 assert tmpdir + "/afile2" in fs.ls(tmpdir)
337
338 fs.move(tmpdir + "/afile", tmpdir + "/afile3")
339 assert not fs.exists(tmpdir + "/afile")
340
341 fs.rm(tmpdir + "/afile3", recursive=True)
342 assert not fs.exists(tmpdir + "/afile3")
343
344 fs.rm(tmpdir, recursive=True)
345 assert not fs.exists(tmpdir)
346
347
348 def test_recursive_get_put(tmpdir):
349 tmpdir = str(tmpdir)
350 fs = LocalFileSystem()
351
352 fs.mkdir(tmpdir + "/a1/a2/a3")
353 fs.touch(tmpdir + "/a1/a2/a3/afile")
354 fs.touch(tmpdir + "/a1/afile")
355
356 fs.get("file://{0}/a1".format(tmpdir), tmpdir + "/b1", recursive=True)
357 assert fs.isfile(tmpdir + "/b1/afile")
358 assert fs.isfile(tmpdir + "/b1/a2/a3/afile")
359
360 fs.put(tmpdir + "/b1", "file://{0}/c1".format(tmpdir), recursive=True)
361 assert fs.isfile(tmpdir + "/c1/afile")
362 assert fs.isfile(tmpdir + "/c1/a2/a3/afile")
363
364
365 def test_commit_discard(tmpdir):
366 tmpdir = str(tmpdir)
367 fs = LocalFileSystem()
368 with fs.transaction:
369 with fs.open(tmpdir + "/afile", "wb") as f:
370 assert not fs.exists(tmpdir + "/afile")
371 f.write(b"data")
372 assert not fs.exists(tmpdir + "/afile")
373
374 assert fs._transaction is None
375 assert fs.cat(tmpdir + "/afile") == b"data"
376
377 try:
378 with fs.transaction:
379 with fs.open(tmpdir + "/bfile", "wb") as f:
380 f.write(b"data")
381 raise KeyboardInterrupt
382 except KeyboardInterrupt:
383 assert not fs.exists(tmpdir + "/bfile")
384
385
386 def test_make_path_posix():
387 cwd = os.getcwd()
388 assert make_path_posix("/a/posix/path") == "/a/posix/path"
389 assert make_path_posix("/posix") == "/posix"
390 assert make_path_posix("relpath", sep="/") == os.path.join(cwd, "relpath")
391 assert make_path_posix("rel/path", sep="/") == os.path.join(cwd, "rel/path")
392 assert make_path_posix("C:\\path", sep="\\") == "C:/path"
393 assert (
394 make_path_posix(
395 "\\\\windows-server\\someshare\\path\\more\\path\\dir\\foo.parquet"
396 )
397 == "//windows-server/someshare/path/more/path/dir/foo.parquet"
398 )
399 assert "/" in make_path_posix("rel\\path", sep="\\")
400
401
402 def test_links(tmpdir):
403 tmpdir = str(tmpdir)
404 fn0 = os.path.join(tmpdir, "target")
405 fn1 = os.path.join(tmpdir, "link1")
406 fn2 = os.path.join(tmpdir, "link2")
407 data = b"my target data"
408 with open(fn0, "wb") as f:
409 f.write(data)
410 os.symlink(fn0, fn1)
411 os.symlink(fn0, fn2)
412
413 fs = LocalFileSystem()
414 assert fs.info(fn0)["type"] == "file"
415 assert fs.info(fn1)["type"] == "link"
416 assert fs.info(fn2)["type"] == "link"
417
418 assert fs.info(fn0)["size"] == len(data)
419 assert fs.info(fn1)["size"] == len(data)
420 assert fs.info(fn2)["size"] == len(data)
421
422 of = fsspec.open(fn1, "rb")
423 with of as f:
424 assert f.read() == data
425
426 of = fsspec.open(fn2, "rb")
427 with of as f:
428 assert f.read() == data
0 import pytest
1 import sys
2
3
4 def test_1(m):
5 m.touch("/somefile") # NB: is found with or without initial /
6 m.touch("afiles/and/anothers")
7 assert m.find("") == ["afiles/and/anothers", "somefile"]
8 assert list(m.get_mapper("")) == ["afiles/and/anothers", "somefile"]
9
10
11 @pytest.mark.xfail(
12 sys.version_info < (3, 6),
13 reason="py35 error, see https://github.com/intake/filesystem_spec/issues/148",
14 )
15 def test_ls(m):
16 m.touch("/dir/afile")
17 m.touch("/dir/dir1/bfile")
18 m.touch("/dir/dir1/cfile")
19
20 assert m.ls("/", False) == ["/dir/"]
21 assert m.ls("/dir", False) == ["/dir/afile", "/dir/dir1/"]
22 assert m.ls("/dir", True)[0]["type"] == "file"
23 assert m.ls("/dir", True)[1]["type"] == "directory"
24
25 assert len(m.ls("/dir/dir1")) == 2
0 import pytest
1 import shlex
2 import subprocess
3 import time
4 import fsspec
5
6 pytest.importorskip("paramiko")
7
8
9 def stop_docker(name):
10 cmd = shlex.split('docker ps -a -q --filter "name=%s"' % name)
11 cid = subprocess.check_output(cmd).strip().decode()
12 if cid:
13 subprocess.call(["docker", "rm", "-f", cid])
14
15
16 @pytest.fixture(scope="module")
17 def ssh():
18 try:
19 subprocess.check_call(["docker", "run", "hello-world"])
20 except subprocess.CalledProcessError:
21 pytest.skip("docker run not available")
22 return
23
24 # requires docker
25 cmds = [
26 r"apt-get update",
27 r"apt-get install -y openssh-server",
28 r"mkdir /var/run/sshd",
29 "bash -c \"echo 'root:pass' | chpasswd\"",
30 (
31 r"sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' "
32 r"/etc/ssh/sshd_config"
33 ),
34 (
35 r"sed 's@session\s*required\s*pam_loginuid.so@session optional "
36 r"pam_loginuid.so@g' -i /etc/pam.d/sshd"
37 ),
38 r'bash -c "echo \"export VISIBLE=now\" >> /etc/profile"',
39 r"/usr/sbin/sshd",
40 ]
41 name = "fsspec_sftp"
42 stop_docker(name)
43 cmd = "docker run -d -p 9200:22 --name {} ubuntu:16.04 sleep 9000".format(name)
44 cid = subprocess.check_output(shlex.split(cmd)).strip().decode()
45 for cmd in cmds:
46 subprocess.call(["docker", "exec", cid] + shlex.split(cmd))
47 try:
48 time.sleep(1)
49 yield dict(host="localhost", port=9200, username="root", password="pass")
50 finally:
51 stop_docker(name)
52
53
54 def test_simple(ssh):
55 f = fsspec.get_filesystem_class("sftp")(**ssh)
56 f.mkdirs("/home/someuser/deeper")
57 f.touch("/home/someuser/deeper/afile")
58 assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
59 assert f.ls("/home/someuser/deeper/") == ["/home/someuser/deeper/afile"]
60 assert f.info("/home/someuser/deeper/afile")["type"] == "file"
61 assert f.info("/home/someuser/deeper/afile")["size"] == 0
62 assert f.exists("/home/someuser")
63 f.rm("/home/someuser", recursive=True)
64 assert not f.exists("/home/someuser")
65
66
67 @pytest.mark.parametrize("protocol", ["sftp", "ssh"])
68 def test_with_url(protocol, ssh):
69 fo = fsspec.open(
70 protocol + "://{username}:{password}@{host}:{port}"
71 "/home/someuserout".format(**ssh),
72 "wb",
73 )
74 with fo as f:
75 f.write(b"hello")
76 fo = fsspec.open(
77 protocol + "://{username}:{password}@{host}:{port}"
78 "/home/someuserout".format(**ssh),
79 "rb",
80 )
81 with fo as f:
82 assert f.read() == b"hello"
83
84
85 def test_transaction(ssh):
86 f = fsspec.get_filesystem_class("sftp")(**ssh)
87 f.mkdirs("/home/someuser/deeper")
88 f.start_transaction()
89 f.touch("/home/someuser/deeper/afile")
90 assert f.find("/home/someuser") == []
91 f.end_transaction()
92     assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
93
94 with f.transaction:
95 assert f._intrans
96 f.touch("/home/someuser/deeper/afile2")
97 assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
98 assert f.find("/home/someuser") == [
99 "/home/someuser/deeper/afile",
100 "/home/someuser/deeper/afile2",
101 ]
102
103
104 def test_makedirs_exist_ok(ssh):
105 f = fsspec.get_filesystem_class("sftp")(**ssh)
106
107 f.makedirs("/a/b/c")
108
109 with pytest.raises(FileExistsError, match="/a/b/c"):
110 f.makedirs("/a/b/c", exist_ok=False)
111
112 f.makedirs("/a/b/c", exist_ok=True)
0 import pickle
1 import pytest
2 import subprocess
3 import time
4 import fsspec
5
6 requests = pytest.importorskip("requests")
7
8 from fsspec.implementations.webhdfs import WebHDFS # noqa: E402
9
10
11 @pytest.fixture(scope="module")
12 def hdfs_cluster():
13 cmd0 = "htcluster shutdown".split()
14 try:
15 subprocess.check_output(cmd0, stderr=subprocess.STDOUT)
16 except FileNotFoundError:
17 pytest.skip("htcluster not found")
18 except subprocess.CalledProcessError as ex:
19 pytest.skip("htcluster failed: " + ex.output.decode())
20 cmd1 = "htcluster startup --image base".split()
21 subprocess.check_output(cmd1)
22 try:
23         t = 90
24         while True:
25 try:
26 requests.get("http://localhost:50070/webhdfs/v1/?op=LISTSTATUS")
27 except: # noqa: E722
28 t -= 1
29 assert t > 0, "Timeout waiting for HDFS"
30 time.sleep(1)
31 continue
32 break
33 time.sleep(7)
34 yield "localhost"
35 finally:
36 subprocess.check_output(cmd0)
37
38
39 def test_pickle(hdfs_cluster):
40 w = WebHDFS(hdfs_cluster, user="testuser")
41 w2 = pickle.loads(pickle.dumps(w))
42 assert w == w2
43
44
45 def test_simple(hdfs_cluster):
46 w = WebHDFS(hdfs_cluster, user="testuser")
47 home = w.home_directory()
48 assert home == "/user/testuser"
49 with pytest.raises(PermissionError):
50 w.mkdir("/root")
51
52
53 def test_url(hdfs_cluster):
54 url = "webhdfs://testuser@localhost:50070/user/testuser/myfile"
55 fo = fsspec.open(url, "wb", data_proxy={"worker.example.com": "localhost"})
56 with fo as f:
57 f.write(b"hello")
58 fo = fsspec.open(url, "rb", data_proxy={"worker.example.com": "localhost"})
59 with fo as f:
60 assert f.read() == b"hello"
61
62
63 def test_workflow(hdfs_cluster):
64 w = WebHDFS(
65 hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
66 )
67 fn = "/user/testuser/testrun/afile"
68 w.mkdir("/user/testuser/testrun")
69 with w.open(fn, "wb") as f:
70 f.write(b"hello")
71 assert w.exists(fn)
72 info = w.info(fn)
73 assert info["size"] == 5
74 assert w.isfile(fn)
75 assert w.cat(fn) == b"hello"
76 w.rm("/user/testuser/testrun", recursive=True)
77 assert not w.exists(fn)
78
79
80 def test_with_gzip(hdfs_cluster):
81 from gzip import GzipFile
82
83 w = WebHDFS(
84 hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
85 )
86 fn = "/user/testuser/gzfile"
87 with w.open(fn, "wb") as f:
88 gf = GzipFile(fileobj=f, mode="w")
89 gf.write(b"hello")
90 gf.close()
91 with w.open(fn, "rb") as f:
92 gf = GzipFile(fileobj=f, mode="r")
93 assert gf.read() == b"hello"
94
95
96 def test_workflow_transaction(hdfs_cluster):
97 w = WebHDFS(
98 hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
99 )
100 fn = "/user/testuser/testrun/afile"
101 w.mkdirs("/user/testuser/testrun")
102 with w.transaction:
103 with w.open(fn, "wb") as f:
104 f.write(b"hello")
105 assert not w.exists(fn)
106 assert w.exists(fn)
107 assert w.ukey(fn)
108 files = w.ls("/user/testuser/testrun", True)
109 summ = w.content_summary("/user/testuser/testrun")
110 assert summ["length"] == files[0]["size"]
111 assert summ["fileCount"] == 1
112
113 w.rm("/user/testuser/testrun", recursive=True)
114 assert not w.exists(fn)
0 import zipfile
1 from contextlib import contextmanager
2 import os
3 import pickle
4 import pytest
5 import sys
6 import tempfile
7 import fsspec
8
9
10 @contextmanager
11 def tempzip(data={}):
12 f = tempfile.mkstemp(suffix="zip")[1]
13 with zipfile.ZipFile(f, mode="w") as z:
14 for k, v in data.items():
15 z.writestr(k, v)
16 try:
17 yield f
18 finally:
19 try:
20 os.remove(f)
21 except (IOError, OSError):
22 pass
23
24
25 data = {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"}
26
27
28 def test_empty():
29 with tempzip() as z:
30 fs = fsspec.get_filesystem_class("zip")(fo=z)
31 assert fs.find("") == []
32
33
34 @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip-info odd on py35")
35 def test_mapping():
36 with tempzip(data) as z:
37 fs = fsspec.get_filesystem_class("zip")(fo=z)
38 m = fs.get_mapper("")
39 assert list(m) == ["a", "b", "deeply/nested/path"]
40 assert m["b"] == data["b"]
41
42
43 @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip not supported on py35")
44 def test_pickle():
45 with tempzip(data) as z:
46 fs = fsspec.get_filesystem_class("zip")(fo=z)
47 fs2 = pickle.loads(pickle.dumps(fs))
48 assert fs2.cat("b") == b"hello"
0 # https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
1
2 import requests
3 from urllib.parse import quote
4 import uuid
5 from ..spec import AbstractFileSystem, AbstractBufferedFile
6 from ..utils import infer_storage_options
7 import logging
8
9 logger = logging.getLogger("webhdfs")
10
11
12 class WebHDFS(AbstractFileSystem):
13 """
14 Interface to HDFS over HTTP
15
16 Three auth mechanisms are supported:
17
18 insecure: no auth is done, and the user is assumed to be whoever they
19 say they are (parameter `user`), or a predefined value such as
20 "dr.who" if not given
21 spnego: when kerberos authentication is enabled, auth is negotiated by
22 requests_kerberos https://github.com/requests/requests-kerberos .
23 This establishes a session based on existing kinit login and/or
24         specified principal/password; parameters are passed with ``kerb_kwargs``
25 token: uses an existing Hadoop delegation token from another secured
26 service. Indeed, this client can also generate such tokens when
27 not insecure. Note that tokens expire, but can be renewed (by a
28 previously specified user) and may allow for proxying.
29
30 """
31
32 tempdir = "/tmp"
33 protocol = "webhdfs", "webHDFS"
34
35 def __init__(
36 self,
37 host,
38 port=50070,
39 kerberos=False,
40 token=None,
41 user=None,
42 proxy_to=None,
43 kerb_kwargs=None,
44 data_proxy=None,
45 **kwargs
46 ):
47 """
48 Parameters
49 ----------
50 host: str
51 Name-node address
52 port: int
53 Port for webHDFS
54 kerberos: bool
55 Whether to authenticate with kerberos for this connection
56 token: str or None
57 If given, use this token on every call to authenticate. A user
58             and user-proxy may be encoded in the token and should not also be
59 given
60 user: str or None
61 If given, assert the user name to connect with
62 proxy_to: str or None
63 If given, the user has the authority to proxy, and this value is
64             the user in whose name actions are taken
65 kerb_kwargs: dict
66 Any extra arguments for HTTPKerberosAuth, see
67 https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py
68 data_proxy: dict, callable or None
69 If given, map data-node addresses. This can be necessary if the
70 HDFS cluster is behind a proxy, running on Docker or otherwise has
71 a mismatch between the host-names given by the name-node and the
72 address by which to refer to them from the client. If a dict,
73 maps host names `host->data_proxy[host]`; if a callable, full
74 URLs are passed, and function must conform to
75 `url->data_proxy(url)`.
76 kwargs
77 """
78 if self._cached:
79 return
80 super().__init__(**kwargs)
81 self.url = "http://{host}:{port}/webhdfs/v1".format(host=host, port=port)
82 self.kerb = kerberos
83 self.kerb_kwargs = kerb_kwargs or {}
84 self.pars = {}
85 self.proxy = data_proxy or {}
86 if token is not None:
87 if user is not None or proxy_to is not None:
88 raise ValueError(
89 "If passing a delegation token, must not set "
90 "user or proxy_to, as these are encoded in the"
91 " token"
92 )
93 self.pars["delegation"] = token
94 if user is not None:
95 self.pars["user.name"] = user
96 if proxy_to is not None:
97 self.pars["doas"] = proxy_to
98 if kerberos and user is not None:
99 raise ValueError(
100 "If using Kerberos auth, do not specify the "
101 "user, this is handled by kinit."
102 )
103 self._connect()
104
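    # Illustrative sketch (not part of the original source): how the three auth
    # modes described in the class docstring might be selected at construction
    # time. The host name below is a placeholder.
    #
    #     fs = WebHDFS("namenode.example.com", user="alice")          # insecure
    #     fs = WebHDFS("namenode.example.com", kerberos=True)         # spnego
    #     fs = WebHDFS("namenode.example.com", token="<delegation>")  # token
    #
    # Equivalently via the registry: fsspec.filesystem("webhdfs", host=..., user=...)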
105 def _connect(self):
106 self.session = requests.Session()
107 if self.kerb:
108 from requests_kerberos import HTTPKerberosAuth
109
110 self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
111
112 def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
113 url = self.url + quote(path or "")
114 args = kwargs.copy()
115 args.update(self.pars)
116 args["op"] = op.upper()
117 logger.debug(url, method, args)
118 out = self.session.request(
119 method=method.upper(),
120 url=url,
121 params=args,
122 data=data,
123 allow_redirects=redirect,
124 )
125 if out.status_code == 404:
126 raise FileNotFoundError(path)
127 if out.status_code == 403:
128 raise PermissionError(path or "")
129 if out.status_code == 401:
130 raise PermissionError # not specific to path
131 out.raise_for_status()
132 return out
133
134 def _open(
135 self,
136 path,
137 mode="rb",
138 block_size=None,
139 autocommit=True,
140 replication=None,
141 permissions=None,
142 **kwargs
143 ):
144 """
145
146 Parameters
147 ----------
148 path: str
149 File location
150 mode: str
151 'rb', 'wb', etc.
152 block_size: int
153 Client buffer size for read-ahead or write buffer
154 autocommit: bool
155 If False, writes to temporary file that only gets put in final
156 location upon commit
157 replication: int
158 Number of copies of file on the cluster, write mode only
159 permissions: str or int
160 posix permissions, write mode only
161 kwargs
162
163 Returns
164 -------
165 WebHDFile instance
166 """
167 block_size = block_size or self.blocksize
168 return WebHDFile(
169 self,
170 path,
171 mode=mode,
172 block_size=block_size,
173 tempdir=self.tempdir,
174 autocommit=autocommit,
175 replication=replication,
176 permissions=permissions,
177 )
178
179 @staticmethod
180 def _process_info(info):
181 info["type"] = info["type"].lower()
182 info["size"] = info["length"]
183 return info
184
185 @classmethod
186 def _strip_protocol(cls, path):
187 return infer_storage_options(path)["path"]
188
189 @staticmethod
190 def _get_kwargs_from_urls(urlpath):
191 out = infer_storage_options(urlpath)
192 out.pop("path", None)
193 out.pop("protocol", None)
194 if "username" in out:
195 out["user"] = out.pop("username")
196 return out
197
198 def info(self, path):
199 out = self._call("GETFILESTATUS", path=path)
200 info = out.json()["FileStatus"]
201 info["name"] = path
202 return self._process_info(info)
203
204 def ls(self, path, detail=False):
205 out = self._call("LISTSTATUS", path=path)
206 infos = out.json()["FileStatuses"]["FileStatus"]
207 for info in infos:
208 self._process_info(info)
209 info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
210 if detail:
211 return sorted(infos, key=lambda i: i["name"])
212 else:
213 return sorted(info["name"] for info in infos)
214
215 def content_summary(self, path):
216 """Total numbers of files, directories and bytes under path"""
217 out = self._call("GETCONTENTSUMMARY", path=path)
218 return out.json()["ContentSummary"]
219
220 def ukey(self, path):
221 """Checksum info of file, giving method and result"""
222 out = self._call("GETFILECHECKSUM", path=path, redirect=False)
223 location = self._apply_proxy(out.headers["Location"])
224 out2 = self.session.get(location)
225 out2.raise_for_status()
226 return out2.json()["FileChecksum"]
227
228 def home_directory(self):
229 """Get user's home directory"""
230 out = self._call("GETHOMEDIRECTORY")
231 return out.json()["Path"]
232
233 def get_delegation_token(self, renewer=None):
234         """Retrieve token which can give the same authority to other users
235
236 Parameters
237 ----------
238 renewer: str or None
239 User who may use this token; if None, will be current user
240 """
241 if renewer:
242 out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
243 else:
244 out = self._call("GETDELEGATIONTOKEN")
245 t = out.json()["Token"]
246 if t is None:
247 raise ValueError("No token available for this user/security context")
248 return t["urlString"]
249
250 def renew_delegation_token(self, token):
251 """Make token live longer. Returns new expiry time"""
252 out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
253 return out.json()["long"]
254
255 def cancel_delegation_token(self, token):
256 """Stop the token from being useful"""
257 self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
258
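    # Illustrative sketch (not from the original source): the delegation-token
    # life-cycle exposed by the three methods above.
    #
    #     tok = fs.get_delegation_token()          # issue for the current user
    #     expiry = fs.renew_delegation_token(tok)  # extend its lifetime
    #     fs.cancel_delegation_token(tok)          # stop it being useful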
259 def chmod(self, path, mod):
260 """Set the permission at path
261
262 Parameters
263 ----------
264 path: str
265 location to set (file or directory)
266 mod: str or int
267             posix representation of permission, given as oct string, e.g., '777'
268 or 0o777
269 """
270 self._call("SETPERMISSION", method="put", path=path, permission=mod)
271
272 def chown(self, path, owner=None, group=None):
273 """Change owning user and/or group"""
274 kwargs = {}
275 if owner is not None:
276 kwargs["owner"] = owner
277 if group is not None:
278 kwargs["group"] = group
279 self._call("SETOWNER", method="put", path=path, **kwargs)
280
281 def set_replication(self, path, replication):
282 """
283 Set file replication factor
284
285 Parameters
286 ----------
287 path: str
288 File location (not for directories)
289 replication: int
290 Number of copies of file on the cluster. Should be smaller than
291 number of data nodes; normally 3 on most systems.
292 """
293 self._call("SETREPLICATION", path=path, method="put", replication=replication)
294
295 def mkdir(self, path, **kwargs):
296 self._call("MKDIRS", method="put", path=path)
297
298 def makedirs(self, path, exist_ok=False):
299 if exist_ok is False and self.exists(path):
300 raise FileExistsError(path)
301 self.mkdir(path)
302
303 def mv(self, path1, path2, **kwargs):
304 self._call("RENAME", method="put", path=path1, destination=path2)
305
306 def rm(self, path, recursive=False, **kwargs):
307 self._call(
308 "DELETE",
309 method="delete",
310 path=path,
311 recursive="true" if recursive else "false",
312 )
313
314 def _apply_proxy(self, location):
315 if self.proxy and callable(self.proxy):
316 location = self.proxy(location)
317 elif self.proxy:
318 # as a dict
319 for k, v in self.proxy.items():
320 location = location.replace(k, v, 1)
321 return location
322
323
324 class WebHDFile(AbstractBufferedFile):
325 """A file living in HDFS over webHDFS"""
326
327 def __init__(self, fs, path, **kwargs):
328 super().__init__(fs, path, **kwargs)
329 kwargs = kwargs.copy()
330 if kwargs.get("permissions", None) is None:
331 kwargs.pop("permissions", None)
332 if kwargs.get("replication", None) is None:
333 kwargs.pop("replication", None)
334 self.permissions = kwargs.pop("permissions", 511)
335 tempdir = kwargs.pop("tempdir")
336 if kwargs.pop("autocommit", False) is False:
337 self.target = self.path
338 self.path = "/".join([tempdir, str(uuid.uuid4())])
339
340 def _upload_chunk(self, final=False):
341 """ Write one part of a multi-block file upload
342
343 Parameters
344 ==========
345         ----------
346 This is the last block, so should complete file, if
347 self.autocommit is True.
348 """
349 out = self.fs.session.post(self.location, data=self.buffer.getvalue())
350 out.raise_for_status()
351 return True
352
353 def _initiate_upload(self):
354 """ Create remote file/upload """
355 if "a" in self.mode:
356 op, method = "APPEND", "POST"
357 else:
358 op, method = "CREATE", "PUT"
359 if self.fs.exists(self.path):
360 # no "truncate" or "create empty"
361 self.fs.rm(self.path)
362 out = self.fs._call(op, method, self.path, redirect=False, **self.kwargs)
363 location = self.fs._apply_proxy(out.headers["Location"])
364 if "w" in self.mode:
365 # create empty file to append to
366 out2 = self.fs.session.put(location)
367 out2.raise_for_status()
368 self.location = location.replace("CREATE", "APPEND")
369
370 def _fetch_range(self, start, end):
371 out = self.fs._call(
372 "OPEN", path=self.path, offset=start, length=end - start, redirect=False
373 )
374 out.raise_for_status()
375 location = out.headers["Location"]
376 out2 = self.fs.session.get(self.fs._apply_proxy(location))
377 return out2.content
378
379 def commit(self):
380 self.fs.mv(self.path, self.target)
381
382 def discard(self):
383 self.fs.rm(self.path)
0 from __future__ import print_function, division, absolute_import
1
2 import zipfile
3 from fsspec import AbstractFileSystem, open_files
4 from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE
5
6
7 class ZipFileSystem(AbstractFileSystem):
8 """Read contents of ZIP archive as a file-system
9
10 Keeps file object open while instance lives.
11
12 This class is pickleable, but not necessarily thread-safe
13 """
14
15 root_marker = ""
16
17 def __init__(self, fo="", mode="r", **storage_options):
18 """
19 Parameters
20 ----------
21 fo: str or file-like
22 Contains ZIP, and must exist. If a str, will fetch file using
23 `open_files()`, which must return one file exactly.
24 mode: str
25 Currently, only 'r' accepted
26 storage_options: key-value
27 May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
28 other parameters for requests
29 """
30 if self._cached:
31 return
32 AbstractFileSystem.__init__(self)
33 if mode != "r":
34 raise ValueError("Only read from zip files accepted")
35 self.in_fo = fo
36 if isinstance(fo, str):
37 files = open_files(fo)
38 if len(files) != 1:
39 raise ValueError(
40                     'Path "{}" did not resolve to exactly '
41 'one file: "{}"'.format(fo, files)
42 )
43 fo = files[0]
44 self.fo = fo.__enter__() # the whole instance is a context
45 self.zip = zipfile.ZipFile(self.fo)
46 self.block_size = storage_options.get("block_size", DEFAULT_BLOCK_SIZE)
47 self.dir_cache = None
48
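    # Illustrative sketch (assumed usage, not from the original source):
    # reading a member of a local archive; "archive.zip" and the member name
    # are placeholders.
    #
    #     fs = ZipFileSystem("archive.zip")
    #     fs.ls("")                                # entries at the archive root
    #     with fs.open("inner/file.txt") as f:     # hypothetical member
    #         data = f.read()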
49 @classmethod
50 def _strip_protocol(cls, path):
51 # zip file paths are always relative to the archive root
52 return super()._strip_protocol(path).lstrip("/")
53
54 def _get_dirs(self):
55 if self.dir_cache is None:
56 files = self.zip.infolist()
57 self.dir_cache = {}
58 for z in files:
59 f = {s: getattr(z, s) for s in zipfile.ZipInfo.__slots__}
60 f.update(
61 {
62 "name": z.filename,
63 "size": z.file_size,
64 "type": ("directory" if z.is_dir() else "file"),
65 }
66 )
67 self.dir_cache[f["name"]] = f
68
69 def ls(self, path, detail=False):
70 self._get_dirs()
71 paths = {}
72 for p, f in self.dir_cache.items():
73 p = p.rstrip("/")
74 if "/" in p:
75 root = p.rsplit("/", 1)[0]
76 else:
77 root = ""
78 if root == path.rstrip("/"):
79 paths[p] = f
80 elif path and all(
81 (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/"))
82 ):
83 # implicit directory
84 ppath = "/".join(p.split("/")[: len(path.split("/")) + 1])
85 if ppath not in paths:
86 out = {"name": ppath + "/", "size": 0, "type": "directory"}
87 paths[ppath] = out
88
89 elif all(
90 (a == b)
91 for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
92 ):
93 # root directory entry
94 ppath = p.rstrip("/").split("/", 1)[0]
95 if ppath not in paths:
96 out = {"name": ppath + "/", "size": 0, "type": "directory"}
97 paths[ppath] = out
98 out = list(paths.values())
99 if detail:
100 return out
101 else:
102 return list(sorted(f["name"] for f in out))
103
104 def cat(self, path):
105 return self.zip.read(path)
106
107 def _open(self, path, mode="rb", **kwargs):
108 path = self._strip_protocol(path)
109 if mode != "rb":
110 raise NotImplementedError
111 info = self.info(path)
112 out = self.zip.open(path, "r")
113 out.size = info["size"]
114 out.name = info["name"]
115 return out
116
117 def ukey(self, path):
118 return tokenize(path, self.in_fo, self.protocol)
0 from collections.abc import MutableMapping
1 from .registry import get_filesystem_class
2 from .core import split_protocol
3
4
5 class FSMap(MutableMapping):
6 """Wrap a FileSystem instance as a mutable wrapping.
7
8 The keys of the mapping become files under the given root, and the
9 values (which must be bytes) the contents of those files.
10
11 Parameters
12 ----------
13 root: string
14 prefix for all the files
15 fs: FileSystem instance
16     check: bool (=False)
17 performs a touch at the location, to check for write access.
18
19 Examples
20 --------
21 >>> fs = FileSystem(**parameters) # doctest: +SKIP
22 >>> d = FSMap('my-data/path/', fs) # doctest: +SKIP
23 or, more likely
24 >>> d = fs.get_mapper('my-data/path/')
25
26 >>> d['loc1'] = b'Hello World' # doctest: +SKIP
27 >>> list(d.keys()) # doctest: +SKIP
28 ['loc1']
29 >>> d['loc1'] # doctest: +SKIP
30 b'Hello World'
31 """
32
33 def __init__(self, root, fs, check=False, create=False):
34 self.fs = fs
35 self.root = fs._strip_protocol(root).rstrip(
36 "/"
37 ) # we join on '/' in _key_to_str
38 if create:
39 if not self.fs.exists(root):
40 self.fs.mkdir(root)
41 if check:
42 if not self.fs.exists(root):
43 raise ValueError(
44                         "Path %s does not exist. Create"
45 " with the ``create=True`` keyword" % root
46 )
47 self.fs.touch(root + "/a")
48 self.fs.rm(root + "/a")
49
50 def clear(self):
51 """Remove all keys below root - empties out mapping
52 """
53 try:
54 self.fs.rm(self.root, True)
55 self.fs.mkdir(self.root)
56 except: # noqa: E722
57 pass
58
59 def _key_to_str(self, key):
60 """Generate full path for the key"""
61 if isinstance(key, (tuple, list)):
62 key = str(tuple(key))
63 else:
64 key = str(key)
65 return "/".join([self.root, key]) if self.root else key
66
67 def _str_to_key(self, s):
68         """Strip path prefix to leave key name"""
69 return s[len(self.root) :].lstrip("/")
70
71 def __getitem__(self, key, default=None):
72 """Retrieve data"""
73 key = self._key_to_str(key)
74 try:
75 result = self.fs.cat(key)
76 except: # noqa: E722
77 if default is not None:
78 return default
79 raise KeyError(key)
80 return result
81
82 def pop(self, key, default=None):
83 result = self.__getitem__(key, default)
84 try:
85 del self[key]
86 except KeyError:
87 pass
88 return result
89
90 def __setitem__(self, key, value):
91 """Store value in key"""
92 key = self._key_to_str(key)
93 self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
94 with self.fs.open(key, "wb") as f:
95 f.write(value)
96
97 def __iter__(self):
98 return (self._str_to_key(x) for x in self.fs.find(self.root))
99
100 def __len__(self):
101 return len(self.fs.find(self.root))
102
103 def __delitem__(self, key):
104 """Remove key"""
105 try:
106 self.fs.rm(self._key_to_str(key))
107 except: # noqa: E722
108 raise KeyError
109
110 def __contains__(self, key):
111 """Does key exist in mapping?"""
112 return self.fs.exists(self._key_to_str(key))
113
114 def __getstate__(self):
115 """Mapping should be pickleable"""
116 # TODO: replace with reduce to reinstantiate?
117 return self.fs, self.root
118
119 def __setstate__(self, state):
120 fs, root = state
121 self.fs = fs
122 self.root = root
123
124
125 def get_mapper(url, check=False, create=False, **kwargs):
126 """Create key-value interface for given URL and options
127
128 The URL will be of the form "protocol://location" and point to the root
129 of the mapper required. All keys will be file-names below this location,
130 and their values the contents of each key.
131
132 Parameters
133 ----------
134 url: str
135 Root URL of mapping
136 check: bool
137 Whether to attempt to read from the location before instantiation, to
138 check that the mapping does exist
139 create: bool
140 Whether to make the directory corresponding to the root before
141 instantiating
142
143 Returns
144 -------
145 ``FSMap`` instance, the dict-like key-value store.
146 """
147 protocol, path = split_protocol(url)
148 cls = get_filesystem_class(protocol)
149 fs = cls(**kwargs)
150 # Removing protocol here - could defer to each open() on the backend
151 return FSMap(url, fs, check, create)
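
# Illustrative sketch (assumed usage, not from the original source): the memory
# filesystem is used here only because it needs no external service.
#
#     m = get_mapper("memory://mapped")
#     m["key"] = b"value"          # writes the file <root>/key
#     list(m)                      # -> ["key"]
#     m["key"]                     # -> b"value"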
0 import importlib
1 from distutils.version import LooseVersion
2
3 __all__ = ["registry", "get_filesystem_class", "default"]
4
5 # mapping protocol: implementation class object
6 registry = {}
7 default = "file"
8
9 # protocols mapped to the class which implements them. This dict can
10 # be dynamically updated.
11 known_implementations = {
12 "file": {"class": "fsspec.implementations.local.LocalFileSystem"},
13 "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
14 "http": {
15 "class": "fsspec.implementations.http.HTTPFileSystem",
16 "err": 'HTTPFileSystem requires "requests" to be installed',
17 },
18 "https": {
19 "class": "fsspec.implementations.http.HTTPFileSystem",
20 "err": 'HTTPFileSystem requires "requests" to be installed',
21 },
22 "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
23 "gcs": {
24 "class": "gcsfs.GCSFileSystem",
25 "err": "Please install gcsfs to access Google Storage",
26 },
27 "gs": {
28 "class": "gcsfs.GCSFileSystem",
29 "err": "Please install gcsfs to access Google Storage",
30 },
31 "sftp": {
32 "class": "fsspec.implementations.sftp.SFTPFileSystem",
33 "err": 'SFTPFileSystem requires "paramiko" to be installed',
34 },
35 "ssh": {
36 "class": "fsspec.implementations.sftp.SFTPFileSystem",
37 "err": 'SFTPFileSystem requires "paramiko" to be installed',
38 },
39 "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
40 "hdfs": {
41 "class": "fsspec.implementations.hdfs.PyArrowHDFS",
42 "err": "pyarrow and local java libraries required for HDFS",
43 },
44 "webhdfs": {
45 "class": "fsspec.implementations.webhdfs.WebHDFS",
46 "err": 'webHDFS access requires "requests" to be installed',
47 },
48 "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
49 "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
50 "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
51 "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
52 "dask": {
53 "class": "fsspec.implementations.dask.DaskWorkerFileSystem",
54 "err": "Install dask distributed to access worker file system",
55 },
56 }
57
58 minversions = {"s3fs": LooseVersion("0.3.0"), "gcsfs": LooseVersion("0.3.0")}
59
60
61 def get_filesystem_class(protocol):
62 """Fetch named protocol implementation from the registry
63
64 The dict ``known_implementations`` maps protocol names to the locations
65 of classes implementing the corresponding file-system. When used for the
66 first time, appropriate imports will happen and the class will be placed in
67 the registry. All subsequent calls will fetch directly from the registry.
68
69 Some protocol implementations require additional dependencies, and so the
70 import may fail. In this case, the string in the "err" field of the
71 ``known_implementations`` will be given as the error message.
72 """
73 if protocol is None:
74 protocol = default
75
76 if protocol not in registry:
77 if protocol not in known_implementations:
78 raise ValueError("Protocol not known: %s" % protocol)
79 bit = known_implementations[protocol]
80 mod, name = bit["class"].rsplit(".", 1)
81 minversion = minversions.get(mod, None)
82 err = None
83 try:
84 mod = importlib.import_module(mod)
85 except ImportError:
86 err = ImportError(bit["err"])
87
88 except Exception as e:
89 err = e
90 if err is not None:
91 raise RuntimeError(str(err))
92
93 if minversion:
94 version = getattr(mod, "__version__", None)
95 if version and LooseVersion(version) < minversion:
96 raise RuntimeError(
97 "'{}={}' is installed, but version '{}' or "
98 "higher is required".format(mod.__name__, version, minversion)
99 )
100 registry[protocol] = getattr(mod, name)
101 cls = registry[protocol]
102 if getattr(cls, "protocol", None) in ("abstract", None):
103 cls.protocol = protocol
104
105 return cls
106
107
108 def filesystem(protocol, **storage_options):
109 """Instantiate filesystems for given protocol and arguments
110
111 ``storage_options`` are specific to the protocol being chosen, and are
112 passed directly to the class.
113 """
114 cls = get_filesystem_class(protocol)
115 return cls(**storage_options)
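
# Illustrative sketch (assumed usage, not from the original source): protocols
# resolve lazily through ``known_implementations``, and the dict can be extended
# at runtime before first use.
#
#     cls = get_filesystem_class("memory")      # imports and caches the class
#     fs = filesystem("memory")                 # equivalent to cls()
#     known_implementations["myproto"] = {
#         "class": "mypackage.MyFileSystem",    # hypothetical dotted path
#         "err": "install mypackage to use myproto",
#     }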
0 import warnings
1 from hashlib import md5
2 import io
3 import os
4 import logging
5
6 from .transaction import Transaction
7 from .utils import read_block, tokenize, stringify_path
8
9 logger = logging.getLogger("fsspec")
10
11
12 def make_instance(cls, args, kwargs):
13 return cls(*args, **kwargs)
14
15
16 class _Cached(type):
17 """
18 Metaclass for caching file system instances.
19
20 Notes
21 -----
22 Instances are cached according to
23
24 * The values of the class attributes listed in `_extra_tokenize_attributes`
25 * The arguments passed to ``__init__``.
26
27 This creates an additional reference to the filesystem, which prevents the
28 filesystem from being garbage collected when all *user* references go away.
29 A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
30 be made for a filesystem instance to be garbage collected.
31 """
32
33 cachable = True
34 _extra_tokenize_attributes = ()
35
36 def __init__(self, *args, **kwargs):
37 super().__init__(*args, **kwargs)
38 # Note: we intentionally create a reference here, to avoid garbage
39 # collecting instances when all other references are gone. To really
40 # delete a FileSystem, the cache must be cleared.
41 self._cache = {}
42
43 def __call__(self, *args, **kwargs):
44 cls = type(self)
45 extra_tokens = tuple(
46 getattr(self, attr, None) for attr in self._extra_tokenize_attributes
47 )
48 token = tokenize(cls, *args, *extra_tokens, **kwargs)
49 if self.cachable and token in self._cache:
50 return self._cache[token]
51 else:
52 obj = super().__call__(*args, **kwargs)
53 # Setting _fs_token here causes some static linters to complain.
54 obj._fs_token_ = token
55 self.storage_args = args
56 self.storage_options = kwargs
57
58 if self.cachable:
59 self._cache[token] = obj
60 return obj
61
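# Illustrative sketch (not from the original source): because of the _Cached
# metaclass, constructing a cachable filesystem twice with identical arguments
# returns the same object.
#
#     import fsspec
#     a = fsspec.filesystem("memory")
#     b = fsspec.filesystem("memory")
#     assert a is b        # same token -> same cached instance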
62
63 try: # optionally derive from pyarrow's FileSystem, if available
64 import pyarrow as pa
65
66 up = pa.filesystem.DaskFileSystem
67 except ImportError:
68 up = object
69
70
71 class AbstractFileSystem(up, metaclass=_Cached):
72 """
73 An abstract super-class for pythonic file-systems
74
75 Implementations are expected to be compatible with or, better, subclass
76 from here.
77 """
78
79 cachable = True # this class can be cached, instances reused
80 _cached = False
81 blocksize = 2 ** 22
82 sep = "/"
83 protocol = "abstract"
84 root_marker = "" # For some FSs, may require leading '/' or other character
85
86 #: Extra *class attributes* that should be considered when hashing.
87 _extra_tokenize_attributes = ()
88
89 def __init__(self, *args, **storage_options):
90 """Create and configure file-system instance
91
92 Instances may be cachable, so if similar enough arguments are seen
93 a new instance is not required. The token attribute exists to allow
94 implementations to cache instances if they wish.
95
96 A reasonable default should be provided if there are no arguments.
97
98 Subclasses should call this method.
99
100 Magic kwargs that affect functionality here:
101 add_docs: if True, will append docstrings from this spec to the
102 specific implementation
103 """
104 if self._cached:
105 # reusing instance, don't change
106 return
107 self._cached = True
108 self._intrans = False
109 self._transaction = None
110 self.dircache = {}
111
112 if storage_options.pop("add_docs", None):
113 warnings.warn("add_docs is no longer supported.", FutureWarning)
114
115 if storage_options.pop("add_aliases", None):
116 warnings.warn("add_aliases has been removed.", FutureWarning)
117 # This is set in _Cached
118 self._fs_token_ = None
119
120 @property
121 def _fs_token(self):
122 return self._fs_token_
123
124 def __dask_tokenize__(self):
125 return self._fs_token
126
127 def __hash__(self):
128 return int(self._fs_token, 16)
129
130 def __eq__(self, other):
131 return isinstance(other, type(self)) and self._fs_token == other._fs_token
132
133 @classmethod
134 def _strip_protocol(cls, path):
135 """ Turn path from fully-qualified to file-system-specific
136
137 May require FS-specific handling, e.g., for relative paths or links.
138 """
139 path = stringify_path(path)
140 protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
141 for protocol in protos:
142 path = path.rstrip("/")
143 if path.startswith(protocol + "://"):
144 path = path[len(protocol) + 3 :]
145 elif path.startswith(protocol + ":"):
146 path = path[len(protocol) + 1 :]
147 # use of root_marker to make minimum required path, e.g., "/"
148 return path or cls.root_marker
149
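    # Illustrative sketch (not from the original source): for a subclass with
    # protocol "myproto" and the default empty root_marker,
    # _strip_protocol("myproto://bucket/key/") -> "bucket/key" and
    # _strip_protocol("myproto://") -> "" (the root marker).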
150 @staticmethod
151 def _get_kwargs_from_urls(paths):
152 """If kwargs can be encoded in the paths, extract them here
153
154 This should happen before instantiation of the class; incoming paths
155 then should be amended to strip the options in methods.
156
157 Examples may look like an sftp path "sftp://user@host:/my/path", where
158 the user and host should become kwargs and later get stripped.
159 """
160 # by default, nothing happens
161 return {}
162
163 @classmethod
164 def current(cls):
165 """ Return the most recently created FileSystem
166
167 If no instance has been created, then create one with defaults
168 """
169 if not len(cls._cache):
170 return cls()
171 else:
172 return list(cls._cache.values())[-1]
173
174 @property
175 def transaction(self):
176 """A context within which files are committed together upon exit
177
178 Requires the file class to implement `.commit()` and `.discard()`
179 for the normal and exception cases.
180 """
181 if self._transaction is None:
182 self._transaction = Transaction(self)
183 return self._transaction
184
185 def start_transaction(self):
186 """Begin write transaction for deferring files, non-context version"""
187 self._intrans = True
188 self._transaction = Transaction(self)
189 return self.transaction
190
191 def end_transaction(self):
192 """Finish write transaction, non-context version"""
193 self.transaction.complete()
194 self._transaction = None
195
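    # Illustrative sketch (not from the original source): files written inside
    # the context manager are only committed when the block exits cleanly.
    #
    #     with fs.transaction:
    #         with fs.open("/tmp/afile", "wb") as f:   # placeholder path
    #             f.write(b"data")
    #     # the file only exists once the transaction completes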
196 def invalidate_cache(self, path=None):
197 """
198 Discard any cached directory information
199
200 Parameters
201 ----------
202 path: string or None
203 If None, clear all listings cached else listings at or under given
204 path.
205 """
206 pass # not necessary to implement, may have no cache
207
208 def mkdir(self, path, create_parents=True, **kwargs):
209 """
210 Create directory entry at path
211
212         For systems that don't have true directories, may create one for
213 this instance only and not touch the real filesystem
214
215 Parameters
216 ----------
217 path: str
218 location
219 create_parents: bool
220 if True, this is equivalent to ``makedirs``
221 kwargs:
222 may be permissions, etc.
223 """
224 pass # not necessary to implement, may not have directories
225
226 def makedirs(self, path, exist_ok=False):
227 """Recursively make directories
228
229 Creates directory at path and any intervening required directories.
230 Raises exception if, for instance, the path already exists but is a
231 file.
232
233 Parameters
234 ----------
235 path: str
236 leaf directory name
237 exist_ok: bool (False)
238             If False, will error if the target already exists
239 """
240 pass # not necessary to implement, may not have directories
241
242 def rmdir(self, path):
243 """Remove a directory, if empty"""
244 pass # not necessary to implement, may not have directories
245
246 def ls(self, path, detail=True, **kwargs):
247 """List objects at path.
248
249 This should include subdirectories and files at that location. The
250 difference between a file and a directory must be clear when details
251 are requested.
252
253 The specific keys, or perhaps a FileInfo class, or similar, is TBD,
254 but must be consistent across implementations.
255 Must include:
256 - full path to the entry (without protocol)
257 - size of the entry, in bytes. If the value cannot be determined, will
258 be ``None``.
259 - type of entry, "file", "directory" or other
260
261 Additional information
262         may be present, appropriate to the file-system, e.g., generation,
263 checksum, etc.
264
265 May use refresh=True|False to allow use of self._ls_from_cache to
266 check for a saved listing and avoid calling the backend. This would be
267 common where listing may be expensive.
268
269 Parameters
270 ----------
271 path: str
272 detail: bool
273 if True, gives a list of dictionaries, where each is the same as
274 the result of ``info(path)``. If False, gives a list of paths
275 (str).
276 kwargs: may have additional backend-specific options, such as version
277 information
278
279 Returns
280 -------
281 List of strings if detail is False, or list of directory information
282 dicts if detail is True.
283 """
284 raise NotImplementedError
285
286 def _ls_from_cache(self, path):
287 """Check cache for listing
288
289         Returns listing, if found (may be empty list for a directory that exists
290 but contains nothing), None if not in cache.
291 """
292 parent = self._parent(path)
293 if path in self.dircache:
294 return self.dircache[path]
295 elif parent in self.dircache:
296 files = [f for f in self.dircache[parent] if f["name"] == path]
297 if len(files) == 0:
298 # parent dir was listed but did not contain this file
299 raise FileNotFoundError(path)
300 return files
301
302 def walk(self, path, maxdepth=None, **kwargs):
303         """ Return all files below path
304
305 List all files, recursing into subdirectories; output is iterator-style,
306 like ``os.walk()``. For a simple list of files, ``find()`` is available.
307
308 Note that the "files" outputted will include anything that is not
309 a directory, such as links.
310
311 Parameters
312 ----------
313 path: str
314 Root to recurse into
315 maxdepth: int
316 Maximum recursion depth. None means limitless, but not recommended
317 on link-based file-systems.
318 kwargs: passed to ``ls``
319 """
320 path = self._strip_protocol(path)
321 full_dirs = []
322 dirs = []
323 files = []
324
325 try:
326 listing = self.ls(path, detail=True, **kwargs)
327 except (FileNotFoundError, IOError):
328 return [], [], []
329
330 for info in listing:
331 # each info name must be at least [path]/part , but here
332 # we check also for names like [path]/part/
333 name = info["name"].rstrip("/")
334 if info["type"] == "directory" and name != path:
335 # do not include "self" path
336 full_dirs.append(name)
337 dirs.append(name.rsplit("/", 1)[-1])
338 elif name == path:
339                 # file-like with same name as given path
340 files.append("")
341 else:
342 files.append(name.rsplit("/", 1)[-1])
343 yield path, dirs, files
344
345 for d in full_dirs:
346 if maxdepth is None or maxdepth > 1:
347 for res in self.walk(
348 d,
349 maxdepth=(maxdepth - 1) if maxdepth is not None else None,
350 **kwargs
351 ):
352 yield res
353
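    # Illustrative sketch (not from the original source): walk() iterates like
    # os.walk(), yielding (current_path, dir_names, file_names) tuples.
    #
    #     for root, dirs, files in fs.walk("/data", maxdepth=2):   # placeholder path
    #         print(root, dirs, files)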
354 def find(self, path, maxdepth=None, withdirs=False, **kwargs):
355 """List all files below path.
356
357 Like posix ``find`` command without conditions
358
359 Parameters
360 ----------
361 path : str
362 maxdepth: int or None
363 If not None, the maximum number of levels to descend
364 withdirs: bool
365 Whether to include directory paths in the output. This is True
366 when used by glob, but users usually only want files.
367 kwargs are passed to ``ls``.
368 """
369 # TODO: allow equivalent of -name parameter
370 out = set()
371 for path, dirs, files in self.walk(path, maxdepth, **kwargs):
372 if withdirs:
373 files += dirs
374 for name in files:
375 if name and name not in out:
376 out.add("/".join([path.rstrip("/"), name]) if path else name)
377 if self.isfile(path) and path not in out:
378 # walk works on directories, but find should also return [path]
379 # when path happens to be a file
380 out.add(path)
381 return sorted(out)
382
383 def du(self, path, total=True, maxdepth=None, **kwargs):
384 """Space used by files within a path
385
386 Parameters
387 ----------
388 path: str
389 total: bool
390 whether to sum all the file sizes
391 maxdepth: int or None
392 maximum number of directory levels to descend, None for unlimited.
393 kwargs: passed to ``ls``
394
395 Returns
396 -------
397 Dict of {fn: size} if total=False, or int otherwise, where numbers
398 refer to bytes used.
399 """
400 sizes = {}
401 for f in self.find(path, maxdepth=maxdepth, **kwargs):
402 info = self.info(f)
403 sizes[info["name"]] = info["size"]
404 if total:
405 return sum(sizes.values())
406 else:
407 return sizes
408
409 def glob(self, path, **kwargs):
410 """
411 Find files by glob-matching.
412
413 If the path ends with '/' and does not contain "*", it is essentially
414 the same as ``ls(path)``, returning only files.
415
416 We support ``"**"``,
417 ``"?"`` and ``"[..]"``.
418
419 kwargs are passed to ``ls``.
420 """
421 import re
422 from glob import has_magic
423
424 ends = path.endswith("/")
425 path = self._strip_protocol(path)
426 indstar = path.find("*") if path.find("*") >= 0 else len(path)
427 indques = path.find("?") if path.find("?") >= 0 else len(path)
428 indbrace = path.find("[") if path.find("[") >= 0 else len(path)
429
430 ind = min(indstar, indques, indbrace)
431
432 if not has_magic(path):
433 root = path
434 depth = 1
435 if ends:
436 path += "/*"
437 elif self.exists(path):
438 return [path]
439 else:
440 return [] # glob of non-existent returns empty
441 elif "/" in path[:ind]:
442 ind2 = path[:ind].rindex("/")
443 root = path[: ind2 + 1]
444 depth = 20 if "**" in path else path[ind2 + 1 :].count("/") + 1
445 else:
446 root = ""
447 depth = 20 if "**" in path else 1
448 allpaths = self.find(root, maxdepth=depth, withdirs=True, **kwargs)
449 pattern = (
450 "^"
451 + (
452 path.replace("\\", r"\\")
453 .replace(".", r"\.")
454 .replace("+", r"\+")
455 .replace("//", "/")
456 .replace("(", r"\(")
457 .replace(")", r"\)")
458 .replace("|", r"\|")
459 .rstrip("/")
460 .replace("?", ".")
461 )
462 + "$"
463 )
464 pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
465 pattern = re.sub("[*]", "[^/]*", pattern)
466 pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
467 out = {p for p in allpaths if pattern.match(p.replace("//", "/").rstrip("/"))}
468 return list(sorted(out))
469
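    # Illustrative sketch (not from the original source), matching the rules in
    # the docstring above: "*" matches within one path segment, "**" recurses
    # into subdirectories, and a trailing "/" behaves like ls().
    #
    #     fs.glob("/data/*.csv")       # CSVs directly under /data (placeholder paths)
    #     fs.glob("/data/**/*.csv")    # CSVs at any depth below /data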
470 def exists(self, path):
471 """Is there a file at the given path"""
472 try:
473 self.info(path)
474 return True
475 except: # noqa: E722
476 # any exception allowed bar FileNotFoundError?
477 return False
478
479 def info(self, path, **kwargs):
480 """Give details of entry at path
481
482 Returns a single dictionary, with exactly the same information as ``ls``
483 would with ``detail=True``.
484
485         The default implementation calls ls and could be overridden by a
486         shortcut. kwargs are passed on to ``ls()``.
487
488 Some file systems might not be able to measure the file's size, in
489 which case, the returned dict will include ``'size': None``.
490
491 Returns
492 -------
493 dict with keys: name (full path in the FS), size (in bytes), type (file,
494 directory, or something else) and other FS-specific keys.
495 """
496 path = self._strip_protocol(path)
497 out = self.ls(self._parent(path), detail=True, **kwargs)
498 out = [o for o in out if o["name"].rstrip("/") == path]
499 if out:
500 return out[0]
501 out = self.ls(path, detail=True, **kwargs)
502 path = path.rstrip("/")
503 out1 = [o for o in out if o["name"].rstrip("/") == path]
504 if len(out1) == 1:
505 if "size" not in out1[0]:
506 out1[0]["size"] = None
507 return out1[0]
508 elif len(out1) > 1 or out:
509 return {"name": path, "size": 0, "type": "directory"}
510 else:
511 raise FileNotFoundError(path)
512
513 def checksum(self, path):
514 """Unique value for current version of file
515
516 If the checksum is the same from one moment to another, the contents
517 are guaranteed to be the same. If the checksum changes, the contents
518 *might* have changed.
519
520 This should normally be overridden; default will probably capture
521 creation/modification timestamp (which would be good) or maybe
522 access timestamp (which would be bad)
523 """
524 return int(tokenize(self.info(path)), 16)
525
526 def size(self, path):
527 """Size in bytes of file"""
528 return self.info(path).get("size", None)
529
530 def isdir(self, path):
531 """Is this entry directory-like?"""
532 try:
533 return self.info(path)["type"] == "directory"
534 except FileNotFoundError:
535 return False
536
537 def isfile(self, path):
538 """Is this entry file-like?"""
539 try:
540 return self.info(path)["type"] == "file"
541 except: # noqa: E722
542 return False
543
544 def cat(self, path):
545 """ Get the content of a file """
546 return self.open(path, "rb").read()
547
548 def get(self, rpath, lpath, recursive=False, **kwargs):
549 """Copy file to local.
550
551 Possible extension: maybe should be able to copy to any file-system
552 (streaming through local).
553 """
554 rpath = self._strip_protocol(rpath)
555 if recursive:
556 rpaths = self.find(rpath)
557 lpaths = [
558 os.path.join(lpath, path[len(rpath) :].lstrip("/")) for path in rpaths
559 ]
560 for lpath in lpaths:
561 dirname = os.path.dirname(lpath)
562 if not os.path.isdir(dirname):
563 os.makedirs(dirname)
564 else:
565 rpaths = [rpath]
566 lpaths = [lpath]
567 for lpath, rpath in zip(lpaths, rpaths):
568 with self.open(rpath, "rb", **kwargs) as f1:
569 with open(lpath, "wb") as f2:
570 data = True
571 while data:
572 data = f1.read(self.blocksize)
573 f2.write(data)
574
575 def put(self, lpath, rpath, recursive=False, **kwargs):
576 """ Upload file from local """
577 if recursive:
578 lpaths = []
579 for dirname, subdirlist, filelist in os.walk(lpath):
580 lpaths += [os.path.join(dirname, filename) for filename in filelist]
581 rootdir = os.path.basename(lpath.rstrip("/"))
582 if self.exists(rpath):
583 # copy lpath inside rpath directory
584 rpath2 = os.path.join(rpath, rootdir)
585 else:
586 # copy lpath as rpath directory
587 rpath2 = rpath
588 rpaths = [
589 os.path.join(rpath2, path[len(lpath) :].lstrip("/")) for path in lpaths
590 ]
591 else:
592 lpaths = [lpath]
593 rpaths = [rpath]
594 for lpath, rpath in zip(lpaths, rpaths):
595 with open(lpath, "rb") as f1:
596 with self.open(rpath, "wb", **kwargs) as f2:
597 data = True
598 while data:
599 data = f1.read(self.blocksize)
600 f2.write(data)
601
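    # Illustrative sketch (not from the original source): mirroring a remote
    # tree to a local directory and back; all paths are placeholders.
    #
    #     fs.get("remote/dir", "/tmp/localcopy", recursive=True)
    #     fs.put("/tmp/localcopy", "remote/dir2", recursive=True)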
602 def head(self, path, size=1024):
603 """ Get the first ``size`` bytes from file """
604 with self.open(path, "rb") as f:
605 return f.read(size)
606
607 def tail(self, path, size=1024):
608 """ Get the last ``size`` bytes from file """
609 with self.open(path, "rb") as f:
610 f.seek(max(-size, -f.size), 2)
611 return f.read()
612
613 def copy(self, path1, path2, **kwargs):
614 """ Copy within two locations in the filesystem"""
615 raise NotImplementedError
616
617 def mv(self, path1, path2, **kwargs):
618 """ Move file from one location to another """
619 self.copy(path1, path2, **kwargs)
620 self.rm(path1, recursive=False)
621
622 def _rm(self, path):
623 """Delete a file"""
624 raise NotImplementedError
625
626 def rm(self, path, recursive=False, maxdepth=None):
627 """Delete files.
628
629 Parameters
630 ----------
631 path: str or list of str
632 File(s) to delete.
633 recursive: bool
634 If file(s) are directories, recursively delete contents and then
635 also remove the directory
636 maxdepth: int or None
637 Depth to pass to walk for finding files to delete, if recursive.
638 If None, there will be no limit and infinite recursion may be
639 possible.
640 """
641 # prefer some bulk method, if possible
642 if not isinstance(path, list):
643 path = [path]
644 for p in path:
645 if recursive:
646 out = self.walk(p, maxdepth=maxdepth)
647 for pa_, _, files in reversed(list(out)):
648 for name in files:
649 fn = "/".join([pa_, name]) if pa_ else name
650 self.rm(fn)
651 self.rmdir(pa_)
652 else:
653 self._rm(p)
654
655 @classmethod
656 def _parent(cls, path):
657 path = cls._strip_protocol(path.rstrip("/"))
658 if "/" in path:
659 return cls.root_marker + path.rsplit("/", 1)[0]
660 else:
661 return cls.root_marker
662
663 def _open(
664 self,
665 path,
666 mode="rb",
667 block_size=None,
668 autocommit=True,
669 cache_options=None,
670 **kwargs
671 ):
672 """Return raw bytes-mode file-like from the file-system"""
673 return AbstractBufferedFile(
674 self,
675 path,
676 mode,
677 block_size,
678 autocommit,
679 cache_options=cache_options,
680 **kwargs
681 )
682
683 def open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
684 """
685 Return a file-like object from the filesystem
686
687 The resultant instance must function correctly in a context ``with``
688 block.
689
690 Parameters
691 ----------
692 path: str
693 Target file
694 mode: str like 'rb', 'w'
695 See builtin ``open()``
696 block_size: int
697 Some indication of buffering - this is a value in bytes
698 cache_options : dict, optional
699 Extra arguments to pass through to the cache.
700 encoding, errors, newline: passed on to TextIOWrapper for text mode
701 """
702 import io
703
704 path = self._strip_protocol(path)
705 if "b" not in mode:
706 mode = mode.replace("t", "") + "b"
707
708 text_kwargs = {
709 k: kwargs.pop(k)
710 for k in ["encoding", "errors", "newline"]
711 if k in kwargs
712 }
713 return io.TextIOWrapper(
714 self.open(path, mode, block_size, **kwargs), **text_kwargs
715 )
716 else:
717 ac = kwargs.pop("autocommit", not self._intrans)
718 f = self._open(
719 path,
720 mode=mode,
721 block_size=block_size,
722 autocommit=ac,
723 cache_options=cache_options,
724 **kwargs
725 )
726 if not ac:
727 self.transaction.files.append(f)
728 return f
729
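    # Example (annotation, not part of the original source): text-mode opens wrap
    # the binary file in a TextIOWrapper, so encoding/errors/newline are honoured.
    # A usage sketch, assuming "/myfile" exists on the filesystem:
    #
    #     with fs.open("/myfile", "rt", encoding="latin1") as f:
    #         text = f.read()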
730 def touch(self, path, truncate=True, **kwargs):
731 """ Create empty file, or update timestamp
732
733 Parameters
734 ----------
735 path: str
736 file location
737 truncate: bool
738 If True, always set file size to 0; if False, update timestamp and
739 leave file unchanged, if backend allows this
740 """
741 if truncate or not self.exists(path):
742 with self.open(path, "wb", **kwargs):
743 pass
744 else:
745 raise NotImplementedError # update timestamp, if possible
746
747 def ukey(self, path):
748 """Hash of file properties, to tell if it has changed"""
749 return md5(str(self.info(path)).encode()).hexdigest()
750
751 def read_block(self, fn, offset, length, delimiter=None):
752         """ Read a block of bytes from a file
753
754 Starting at ``offset`` of the file, read ``length`` bytes. If
755 ``delimiter`` is set then we ensure that the read starts and stops at
756 delimiter boundaries that follow the locations ``offset`` and ``offset
757 + length``. If ``offset`` is zero then we start at zero. The
758 bytestring returned WILL include the end delimiter string.
759
760 If offset+length is beyond the eof, reads to eof.
761
762 Parameters
763 ----------
764 fn: string
765 Path to filename
766 offset: int
767 Byte offset to start read
768 length: int
769 Number of bytes to read
770 delimiter: bytes (optional)
771 Ensure reading starts and stops at delimiter bytestring
772
773 Examples
774 --------
775 >>> fs.read_block('data/file.csv', 0, 13) # doctest: +SKIP
776 b'Alice, 100\\nBo'
777 >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n') # doctest: +SKIP
778 b'Alice, 100\\nBob, 200\\n'
779
780 Use ``length=None`` to read to the end of the file.
781 >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n') # doctest: +SKIP
782 b'Alice, 100\\nBob, 200\\nCharlie, 300'
783
784 See Also
785 --------
786 utils.read_block
787 """
788 with self.open(fn, "rb") as f:
789 size = f.size
790 if length is None:
791 length = size
792 if size is not None and offset + length > size:
793 length = size - offset
794 return read_block(f, offset, length, delimiter)
795
796 def __reduce__(self):
797 return make_instance, (type(self), self.storage_args, self.storage_options)
798
799 def _get_pyarrow_filesystem(self):
800 """
801 Make a version of the FS instance which will be acceptable to pyarrow
802 """
803 # all instances already also derive from pyarrow
804 return self
805
806 def get_mapper(self, root, check=False, create=False):
807 """Create key/value store based on this file-system
808
809         Makes a MutableMapping interface to the FS at the given root path.
810 See ``fsspec.mapping.FSMap`` for further details.
811 """
812 from .mapping import FSMap
813
814 return FSMap(root, self, check, create)
815
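    # Example (annotation, not part of the original source): the returned FSMap
    # behaves like a dict keyed by path, e.g. (sketch, assuming a writable FS):
    #
    #     m = fs.get_mapper("/root/path")
    #     m["key0"] = b"data"
    #     assert m["key0"] == b"data"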
816 @classmethod
817 def clear_instance_cache(cls):
818 """
819 Clear the cache of filesystem instances.
820
821 Notes
822 -----
823 Unless overridden by setting the ``cachable`` class attribute to False,
824 the filesystem class stores a reference to newly created instances. This
825 prevents Python's normal rules around garbage collection from working,
826         since the instances' refcounts will not drop to zero until
827 ``clear_instance_cache`` is called.
828 """
829 cls._cache.clear()
830
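    # Example (annotation, not part of the original source): because instances are
    # cached, constructing a filesystem twice with the same arguments returns the
    # same object until the cache is cleared, as the memory filesystem tests in
    # this changeset show:
    #
    #     fs1 = MemoryFileSystem()
    #     fs2 = MemoryFileSystem()
    #     assert fs1 is fs2
    #     MemoryFileSystem.clear_instance_cache()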
831 # ------------------------------------------------------------------------
832 # Aliases
833
834 def makedir(self, path, create_parents=True, **kwargs):
835 """Alias of :ref:`FilesystemSpec.mkdir`."""
836 return self.mkdir(path, create_parents=create_parents, **kwargs)
837
838 def mkdirs(self, path, exist_ok=False):
839 """Alias of :ref:`FilesystemSpec.makedirs`."""
840 return self.makedirs(path, exist_ok=exist_ok)
841
842 def listdir(self, path, detail=True, **kwargs):
843 """Alias of :ref:`FilesystemSpec.ls`."""
844 return self.ls(path, detail=detail, **kwargs)
845
846 def cp(self, path1, path2, **kwargs):
847 """Alias of :ref:`FilesystemSpec.copy`."""
848 return self.copy(path1, path2, **kwargs)
849
850 def move(self, path1, path2, **kwargs):
851 """Alias of :ref:`FilesystemSpec.mv`."""
852 return self.mv(path1, path2, **kwargs)
853
854 def stat(self, path, **kwargs):
855 """Alias of :ref:`FilesystemSpec.info`."""
856 return self.info(path, **kwargs)
857
858 def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
859 """Alias of :ref:`FilesystemSpec.du`."""
860 return self.du(path, total=total, maxdepth=maxdepth, **kwargs)
861
862 def rename(self, path1, path2, **kwargs):
863 """Alias of :ref:`FilesystemSpec.mv`."""
864 return self.mv(path1, path2, **kwargs)
865
866 def delete(self, path, recursive=False, maxdepth=None):
867 """Alias of :ref:`FilesystemSpec.rm`."""
868 return self.rm(path, recursive=recursive, maxdepth=maxdepth)
869
870 def upload(self, lpath, rpath, recursive=False, **kwargs):
871 """Alias of :ref:`FilesystemSpec.put`."""
872 return self.put(lpath, rpath, recursive=recursive, **kwargs)
873
874 def download(self, rpath, lpath, recursive=False, **kwargs):
875 """Alias of :ref:`FilesystemSpec.get`."""
876 return self.get(rpath, lpath, recursive=recursive, **kwargs)
877
878
879 class AbstractBufferedFile(io.IOBase):
880 """Convenient class to derive from to provide buffering
881
882 In the case that the backend does not provide a pythonic file-like object
883 already, this class contains much of the logic to build one. The only
884 methods that need to be overridden are ``_upload_chunk``,
885     ``_initiate_upload`` and ``_fetch_range``.
886 """
887
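    # Example (annotation, not part of the original source): a minimal backend file
    # would look roughly like the sketch below; the class name is hypothetical and
    # the bodies would talk to the actual storage service.
    #
    #     class MyBackendFile(AbstractBufferedFile):
    #         def _initiate_upload(self):
    #             ...  # e.g. start a multipart upload on the remote store
    #
    #         def _upload_chunk(self, final=False):
    #             ...  # send self.buffer contents; finalize when final is True
    #
    #         def _fetch_range(self, start, end):
    #             ...  # return bytes [start, end) from the remote store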
888 DEFAULT_BLOCK_SIZE = 5 * 2 ** 20
889
890 def __init__(
891 self,
892 fs,
893 path,
894 mode="rb",
895 block_size="default",
896 autocommit=True,
897 cache_type="readahead",
898 cache_options=None,
899 **kwargs
900 ):
901 """
902 Template for files with buffered reading and writing
903
904 Parameters
905 ----------
906 fs: instance of FileSystem
907 path: str
908 location in file-system
909 mode: str
910 Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file
911 systems may be read-only, and some may not support append.
912 block_size: int
913 Buffer size for reading or writing, 'default' for class default
914 autocommit: bool
915 Whether to write to final destination; may only impact what
916 happens when file is being closed.
917 cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead"
918 Caching policy in read mode. See the definitions in ``core``.
919 cache_options : dict
920 Additional options passed to the constructor for the cache specified
921 by `cache_type`.
922 kwargs:
923 Gets stored as self.kwargs
924 """
925 from .core import caches
926
927 self.path = path
928 self.fs = fs
929 self.mode = mode
930 self.blocksize = (
931 self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
932 )
933 self.loc = 0
934 self.autocommit = autocommit
935 self.end = None
936 self.start = None
937 self.closed = False
938
939 if cache_options is None:
940 cache_options = {}
941
942 if "trim" in kwargs:
943 warnings.warn(
944 "Passing 'trim' to control the cache behavior has been deprecated. "
945 "Specify it within the 'cache_options' argument instead.",
946 FutureWarning,
947 )
948 cache_options["trim"] = kwargs.pop("trim")
949
950 self.kwargs = kwargs
951
952 if mode not in {"ab", "rb", "wb"}:
953 raise NotImplementedError("File mode not supported")
954 if mode == "rb":
955 if not hasattr(self, "details"):
956 self.details = fs.info(path)
957 self.size = self.details["size"]
958 self.cache = caches[cache_type](
959 self.blocksize, self._fetch_range, self.size, **cache_options
960 )
961 else:
962 self.buffer = io.BytesIO()
963 self.offset = None
964 self.forced = False
965 self.location = None
966
967 @property
968 def closed(self):
969 # get around this attr being read-only in IOBase
970 return self._closed
971
972 @closed.setter
973 def closed(self, c):
974 self._closed = c
975
976 def __hash__(self):
977 if "w" in self.mode:
978 return id(self)
979 else:
980 return int(tokenize(self.details), 16)
981
982 def __eq__(self, other):
983 """Files are equal if they have the same checksum, only in read mode"""
984 return self.mode == "rb" and other.mode == "rb" and hash(self) == hash(other)
985
986 def commit(self):
987 """Move from temp to final destination"""
988
989 def discard(self):
990 """Throw away temporary file"""
991
992 def info(self):
993 """ File information about this path """
994 if "r" in self.mode:
995 return self.details
996 else:
997 raise ValueError("Info not available while writing")
998
999 def tell(self):
1000 """ Current file location """
1001 return self.loc
1002
1003 def seek(self, loc, whence=0):
1004 """ Set current file location
1005
1006 Parameters
1007 ----------
1008 loc: int
1009 byte location
1010 whence: {0, 1, 2}
1011 from start of file, current location or end of file, resp.
1012 """
1013 loc = int(loc)
1014 if not self.mode == "rb":
1015 raise ValueError("Seek only available in read mode")
1016 if whence == 0:
1017 nloc = loc
1018 elif whence == 1:
1019 nloc = self.loc + loc
1020 elif whence == 2:
1021 nloc = self.size + loc
1022 else:
1023 raise ValueError("invalid whence (%s, should be 0, 1 or 2)" % whence)
1024 if nloc < 0:
1025 raise ValueError("Seek before start of file")
1026 self.loc = nloc
1027 return self.loc
1028
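    # Example (annotation, not part of the original source): seeking relative to the
    # current position or to end-of-file, as the FTP-backed tests in this changeset
    # exercise (sketch, assuming ``f`` is open in "rb" mode):
    #
    #     f.seek(-10, 2)   # ten bytes before end-of-file
    #     f.seek(-1, 1)    # back one byte from the current position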
1029 def write(self, data):
1030 """
1031 Write data to buffer.
1032
1033 Buffer only sent on flush() or if buffer is greater than
1034 or equal to blocksize.
1035
1036 Parameters
1037 ----------
1038 data: bytes
1039 Set of bytes to be written.
1040 """
1041 if self.mode not in {"wb", "ab"}:
1042 raise ValueError("File not in write mode")
1043 if self.closed:
1044 raise ValueError("I/O operation on closed file.")
1045 if self.forced:
1046 raise ValueError("This file has been force-flushed, can only close")
1047 out = self.buffer.write(data)
1048 self.loc += out
1049 if self.buffer.tell() >= self.blocksize:
1050 self.flush()
1051 return out
1052
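    # Example (annotation, not part of the original source): writes accumulate in
    # the buffer and are only sent to the backend once the blocksize is reached or
    # the file is closed/flushed with force=True (sketch):
    #
    #     with fs.open("/out", "wb") as f:
    #         f.write(b"hello")   # buffered; returns 5
    #     # close() force-flushes the remaining buffer to the backend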
1053 def flush(self, force=False):
1054 """
1055 Write buffered data to backend store.
1056
1057 Writes the current buffer, if it is larger than the block-size, or if
1058 the file is being closed.
1059
1060 Parameters
1061 ----------
1062 force: bool
1063 When closing, write the last block even if it is smaller than
1064 blocks are allowed to be. Disallows further writing to this file.
1065 """
1066
1067 if self.closed:
1068 raise ValueError("Flush on closed file")
1069 if force and self.forced:
1070 raise ValueError("Force flush cannot be called more than once")
1071 if force:
1072 self.forced = True
1073
1074 if self.mode not in {"wb", "ab"}:
1075 # no-op to flush on read-mode
1076 return
1077
1078 if not force and self.buffer.tell() < self.blocksize:
1079 # Defer write on small block
1080 return
1081
1082 if self.offset is None:
1083 # Initialize a multipart upload
1084 self.offset = 0
1085 self._initiate_upload()
1086
1087 if self._upload_chunk(final=force) is not False:
1088 self.offset += self.buffer.seek(0, 2)
1089 self.buffer = io.BytesIO()
1090
1091 def _upload_chunk(self, final=False):
1092 """ Write one part of a multi-block file upload
1093
1094 Parameters
1095 ==========
1096 final: bool
1097 This is the last block, so should complete file, if
1098 self.autocommit is True.
1099 """
1100         # may not yet have been initialized; may need to call _initiate_upload
1101
1102 def _initiate_upload(self):
1103 """ Create remote file/upload """
1104 pass
1105
1106 def _fetch_range(self, start, end):
1107 """Get the specified set of bytes from remote"""
1108 raise NotImplementedError
1109
1110 def read(self, length=-1):
1111 """
1112 Return data from cache, or fetch pieces as necessary
1113
1114 Parameters
1115 ----------
1116 length: int (-1)
1117 Number of bytes to read; if <0, all remaining bytes.
1118 """
1119 length = -1 if length is None else int(length)
1120 if self.mode != "rb":
1121 raise ValueError("File not in read mode")
1122 if length < 0:
1123 length = self.size - self.loc
1124 if self.closed:
1125 raise ValueError("I/O operation on closed file.")
1126 logger.debug("%s read: %i - %i" % (self, self.loc, self.loc + length))
1127 if length == 0:
1128 # don't even bother calling fetch
1129 return b""
1130 out = self.cache._fetch(self.loc, self.loc + length)
1131 self.loc += len(out)
1132 return out
1133
1134 def readinto(self, b):
1135 """mirrors builtin file's readinto method
1136
1137 https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
1138 """
1139 data = self.read(len(b))
1140 b[: len(data)] = data
1141 return len(data)
1142
1143 def readuntil(self, char=b"\n", blocks=None):
1144 """Return data between current position and first occurrence of char
1145
1146         char is included in the output, except if the end of the file is
1147 encountered first.
1148
1149 Parameters
1150 ----------
1151 char: bytes
1152 Thing to find
1153 blocks: None or int
1154 How much to read in each go. Defaults to file blocksize - which may
1155 mean a new read on every call.
1156 """
1157 out = []
1158 while True:
1159 start = self.tell()
1160 part = self.read(blocks or self.blocksize)
1161 if len(part) == 0:
1162 break
1163 found = part.find(char)
1164 if found > -1:
1165 out.append(part[: found + len(char)])
1166 self.seek(start + found + len(char))
1167 break
1168 out.append(part)
1169 return b"".join(out)
1170
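    # Example (annotation, not part of the original source): with file contents
    # starting b"hello...", as in the tests in this changeset (sketch):
    #
    #     f.readuntil(b"l")   # -> b"hel"; the delimiter is included
    #     f.tell()            # -> 3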
1171 def readline(self):
1172 """Read until first occurrence of newline character
1173
1174 Note that, because of character encoding, this is not necessarily a
1175 true line ending.
1176 """
1177 return self.readuntil(b"\n")
1178
1179 def __next__(self):
1180 out = self.readline()
1181 if out:
1182 return out
1183 raise StopIteration
1184
1185 def __iter__(self):
1186 return self
1187
1188 def readlines(self):
1189 """Return all data, split by the newline character"""
1190 data = self.read()
1191 lines = data.split(b"\n")
1192 out = [l + b"\n" for l in lines[:-1]]
1193 if data.endswith(b"\n"):
1194 return out
1195 else:
1196 return out + [lines[-1]]
1197 # return list(self) ???
1198
1199 def readinto1(self, b):
1200 return self.readinto(b)
1201
1202 def close(self):
1203 """ Close file
1204
1205 Finalizes writes, discards cache
1206 """
1207 if self.closed:
1208 return
1209 if self.mode == "rb":
1210 self.cache = None
1211 else:
1212 if not self.forced:
1213 self.flush(force=True)
1214
1215 if self.fs is not None:
1216 self.fs.invalidate_cache(self.path)
1217 self.fs.invalidate_cache(self.fs._parent(self.path))
1218
1219 self.closed = True
1220
1221 def readable(self):
1222 """Whether opened for reading"""
1223 return self.mode == "rb" and not self.closed
1224
1225 def seekable(self):
1226 """Whether is seekable (only in read mode)"""
1227 return self.readable()
1228
1229 def writable(self):
1230 """Whether opened for writing"""
1231 return self.mode in {"wb", "ab"} and not self.closed
1232
1233 def __del__(self):
1234 self.close()
1235
1236 def __str__(self):
1237 return "<File-like object %s, %s>" % (type(self.fs).__name__, self.path)
1238
1239 __repr__ = __str__
1240
1241 def __enter__(self):
1242 return self
1243
1244 def __exit__(self, *args):
1245 self.close()
(New empty file)
0 """Tests the spec, using memoryfs"""
1
2 import os
3 import pickle
4 from fsspec.implementations.memory import MemoryFileSystem, MemoryFile
5
6
7 def test_idempotent():
8 MemoryFileSystem.clear_instance_cache()
9 fs = MemoryFileSystem()
10 fs2 = MemoryFileSystem()
11 assert fs is fs2
12 assert MemoryFileSystem.current() is fs2
13
14 MemoryFileSystem.clear_instance_cache()
15 assert not MemoryFileSystem._cache
16
17 fs2 = MemoryFileSystem().current()
18 assert fs == fs2
19
20
21 def test_pickle():
22 fs = MemoryFileSystem()
23 fs2 = pickle.loads(pickle.dumps(fs))
24 assert fs == fs2
25
26
27 def test_class_methods():
28 assert MemoryFileSystem._strip_protocol("memory:stuff") == "stuff"
29 assert MemoryFileSystem._strip_protocol("memory://stuff") == "stuff"
30 assert MemoryFileSystem._strip_protocol("stuff") == "stuff"
31 assert MemoryFileSystem._strip_protocol("other://stuff") == "other://stuff"
32
33 assert MemoryFileSystem._get_kwargs_from_urls("memory://user@thing") == {}
34
35
36 def test_get_put(tmpdir):
37 tmpdir = str(tmpdir)
38 fn = os.path.join(tmpdir, "one")
39 open(fn, "wb").write(b"one")
40 os.mkdir(os.path.join(tmpdir, "dir"))
41 fn2 = os.path.join(tmpdir, "dir", "two")
42 open(fn2, "wb").write(b"two")
43
44 fs = MemoryFileSystem()
45 fs.put(fn, "/afile")
46 assert fs.cat("/afile") == b"one"
47
48 fs.store["/bfile"] = MemoryFile(fs, "/bfile", b"data")
49 fn3 = os.path.join(tmpdir, "three")
50 fs.get("/bfile", fn3)
51 assert open(fn3, "rb").read() == b"data"
52
53 fs.put(tmpdir, "/more", recursive=True)
54 assert fs.find("/more") == ["/more/dir/two", "/more/one", "/more/three"]
55
56 for f in [fn, fn2, fn3]:
57 os.remove(f)
58 os.rmdir(os.path.join(tmpdir, "dir"))
59
60 fs.get("/more/", tmpdir + "/", recursive=True)
61 assert open(fn3, "rb").read() == b"data"
62 assert open(fn, "rb").read() == b"one"
63
64
65 def test_du():
66 fs = MemoryFileSystem()
67 fs.store = {
68 "/dir/afile": MemoryFile(fs, "/afile", b"a"),
69 "/dir/dirb/afile": MemoryFile(fs, "/afile", b"bb"),
70 "/dir/dirb/bfile": MemoryFile(fs, "/afile", b"ccc"),
71 }
72 assert fs.du("/dir") == 6
73 assert fs.du("/dir", total=False)["/dir/dirb/afile"] == 2
74 assert fs.du("/dir", maxdepth=0) == 1
75
76
77 def test_head_tail():
78 fs = MemoryFileSystem()
79 with fs.open("/myfile", "wb") as f:
80 f.write(b"I had a nice big cabbage")
81 assert fs.head("/myfile", 5) == b"I had"
82 assert fs.tail("/myfile", 7) == b"cabbage"
83
84
85 def test_move():
86 fs = MemoryFileSystem()
87 with fs.open("/myfile", "wb") as f:
88 f.write(b"I had a nice big cabbage")
89 fs.move("/myfile", "/otherfile")
90 assert not fs.exists("/myfile")
91 assert fs.info("/otherfile")
92 assert isinstance(fs.ukey("/otherfile"), str)
93
94
95 def test_read_block_delimiter():
96 fs = MemoryFileSystem()
97 with fs.open("/myfile", "wb") as f:
98 f.write(b"some\n" b"lines\n" b"of\n" b"text")
99 assert fs.read_block("/myfile", 0, 2, b"\n") == b"some\n"
100 assert fs.read_block("/myfile", 2, 6, b"\n") == b"lines\n"
101 assert fs.read_block("/myfile", 6, 2, b"\n") == b""
102 assert fs.read_block("/myfile", 2, 9, b"\n") == b"lines\nof\n"
103 assert fs.read_block("/myfile", 12, 6, b"\n") == b"text"
104 assert fs.read_block("/myfile", 0, None) == fs.cat("/myfile")
105
106
107 def test_open_text():
108 fs = MemoryFileSystem()
109 with fs.open("/myfile", "wb") as f:
110 f.write(b"some\n" b"lines\n" b"of\n" b"text")
111 f = fs.open("/myfile", "r", encoding="latin1")
112 assert f.encoding == "latin1"
0 import pathlib
1
2 import pytest
3
4 import fsspec.core
5 from fsspec.compression import compr, register_compression
6 from fsspec.utils import compressions, infer_compression
7
8
9 def test_infer_custom_compression():
10 """Inferred compression gets values from fsspec.compression.compr."""
11 assert infer_compression("fn.zip") == "zip"
12 assert infer_compression("fn.gz") == "gzip"
13 assert infer_compression("fn.unknown") is None
14 assert infer_compression("fn.test_custom") is None
15 assert infer_compression("fn.tst") is None
16
17 register_compression("test_custom", lambda f, **kwargs: f, "tst")
18
19 try:
20 assert infer_compression("fn.zip") == "zip"
21 assert infer_compression("fn.gz") == "gzip"
22 assert infer_compression("fn.unknown") is None
23 assert infer_compression("fn.test_custom") is None
24 assert infer_compression("fn.tst") == "test_custom"
25
26 # Duplicate registration in name or extension raises a value error.
27 with pytest.raises(ValueError):
28 register_compression("test_custom", lambda f, **kwargs: f, "tst")
29
30 with pytest.raises(ValueError):
31 register_compression("test_conflicting", lambda f, **kwargs: f, "tst")
32 assert "test_conflicting" not in compr
33
34 # ...but can be forced.
35 register_compression(
36 "test_conflicting", lambda f, **kwargs: f, "tst", force=True
37 )
38 assert infer_compression("fn.zip") == "zip"
39 assert infer_compression("fn.gz") == "gzip"
40 assert infer_compression("fn.unknown") is None
41 assert infer_compression("fn.test_custom") is None
42 assert infer_compression("fn.tst") == "test_conflicting"
43
44 finally:
45 del compr["test_custom"]
46 del compr["test_conflicting"]
47 del compressions["tst"]
48
49
50 def test_lzma_compression_name():
51 pytest.importorskip("lzma")
52 assert infer_compression("fn.xz") == "xz"
53
54
55 def test_lz4_compression(tmpdir):
56 """Infer lz4 compression for .lz4 files if lz4 is available."""
57 tmp_path = pathlib.Path(str(tmpdir))
58
59 lz4 = pytest.importorskip("lz4")
60
61 tmp_path.mkdir(exist_ok=True)
62
63 tdat = "foobar" * 100
64
65 with fsspec.core.open(
66 str(tmp_path / "out.lz4"), mode="wt", compression="infer"
67 ) as outfile:
68 outfile.write(tdat)
69
70 compressed = (tmp_path / "out.lz4").open("rb").read()
71 assert lz4.frame.decompress(compressed).decode() == tdat
72
73 with fsspec.core.open(
74 str(tmp_path / "out.lz4"), mode="rt", compression="infer"
75 ) as infile:
76 assert infile.read() == tdat
77
78 with fsspec.core.open(
79 str(tmp_path / "out.lz4"), mode="rt", compression="lz4"
80 ) as infile:
81 assert infile.read() == tdat
82
83
84 def test_zstd_compression(tmpdir):
85 """Infer zstd compression for .zst files if zstandard is available."""
86 tmp_path = pathlib.Path(str(tmpdir))
87
88 zstd = pytest.importorskip("zstandard")
89
90 tmp_path.mkdir(exist_ok=True)
91
92 tdat = "foobar" * 100
93
94 with fsspec.core.open(
95 str(tmp_path / "out.zst"), mode="wt", compression="infer"
96 ) as outfile:
97 outfile.write(tdat)
98
99 compressed = (tmp_path / "out.zst").open("rb").read()
100 assert zstd.ZstdDecompressor().decompress(compressed, len(tdat)).decode() == tdat
101
102 with fsspec.core.open(
103 str(tmp_path / "out.zst"), mode="rt", compression="infer"
104 ) as infile:
105 assert infile.read() == tdat
106
107 with fsspec.core.open(
108 str(tmp_path / "out.zst"), mode="rt", compression="zstd"
109 ) as infile:
110 assert infile.read() == tdat
111
112
113 def test_snappy_compression(tmpdir):
114 """No registered compression for snappy, but can be specified."""
115 tmp_path = pathlib.Path(str(tmpdir))
116
117 snappy = pytest.importorskip("snappy")
118
119 tmp_path.mkdir(exist_ok=True)
120
121 tdat = "foobar" * 100
122
123 # Snappy isn't inferred.
124 with fsspec.core.open(
125 str(tmp_path / "out.snappy"), mode="wt", compression="infer"
126 ) as outfile:
127 outfile.write(tdat)
128 assert (tmp_path / "out.snappy").open("rb").read().decode() == tdat
129
130 # but can be specified.
131 with fsspec.core.open(
132 str(tmp_path / "out.snappy"), mode="wt", compression="snappy"
133 ) as outfile:
134 outfile.write(tdat)
135
136 compressed = (tmp_path / "out.snappy").open("rb").read()
137 assert snappy.StreamDecompressor().decompress(compressed).decode() == tdat
138
139 with fsspec.core.open(
140 str(tmp_path / "out.snappy"), mode="rb", compression="infer"
141 ) as infile:
142 assert infile.read() == compressed
143
144 with fsspec.core.open(
145 str(tmp_path / "out.snappy"), mode="rt", compression="snappy"
146 ) as infile:
147 assert infile.read() == tdat
0 import pytest
1 import pickle
2 import string
3
4 from fsspec.core import (
5 _expand_paths,
6 OpenFile,
7 caches,
8 get_compression,
9 BaseCache,
10 BlockCache,
11 )
12
13
14 @pytest.mark.parametrize(
15 "path, name_function, num, out",
16 [
17 [["apath"], None, 1, ["apath"]],
18 ["apath.*.csv", None, 1, ["apath.0.csv"]],
19 ["apath.*.csv", None, 2, ["apath.0.csv", "apath.1.csv"]],
20 ["a*", lambda x: "abc"[x], 2, ["aa", "ab"]],
21 ],
22 )
23 def test_expand_paths(path, name_function, num, out):
24 assert _expand_paths(path, name_function, num) == out
25
26
27 def test_expand_error():
28 with pytest.raises(ValueError):
29 _expand_paths("*.*", None, 1)
30
31
32 def test_openfile_api(m):
33 m.open("somepath", "wb").write(b"data")
34 of = OpenFile(m, "somepath")
35 assert str(of) == "<OpenFile 'somepath'>"
36 f = of.open()
37 assert f.read() == b"data"
38 f.close()
39 with OpenFile(m, "somepath", mode="rt") as f:
40 f.read() == "data"
41
42
43 # For test_cache_pickleable(). Functions are only picklable if they are defined
44 # at the top-level of a module
45 def _fetcher(start, end):
46 return b"0" * (end - start)
47
48
49 def letters_fetcher(start, end):
50 return string.ascii_letters[start:end].encode()
51
52
53 @pytest.fixture(params=caches.values(), ids=list(caches.keys()))
54 def Cache_imp(request):
55 return request.param
56
57
58 def test_cache_empty_file(Cache_imp):
59 blocksize = 5
60 size = 0
61 cache = Cache_imp(blocksize, _fetcher, size)
62 assert cache._fetch(0, 0) == b""
63
64
65 def test_cache_pickleable(Cache_imp):
66 blocksize = 5
67 size = 100
68 cache = Cache_imp(blocksize, _fetcher, size)
69 cache._fetch(0, 5) # fill in cache
70 unpickled = pickle.loads(pickle.dumps(cache))
71 assert isinstance(unpickled, Cache_imp)
72 assert unpickled.blocksize == blocksize
73 assert unpickled.size == size
74 assert unpickled._fetch(0, 10) == b"0" * 10
75
76
77 @pytest.mark.parametrize(
78 "size_requests",
79 [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]],
80 )
81 @pytest.mark.parametrize("blocksize", [1, 10, 52, 100])
82 def test_cache_basic(Cache_imp, blocksize, size_requests):
83 cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters))
84
85 for start, end in size_requests:
86 result = cache[start:end]
87 expected = string.ascii_letters[start:end].encode()
88 assert result == expected
89
90
91 def test_xz_lzma_compressions():
92 pytest.importorskip("lzma")
93 # Ensure that both 'xz' and 'lzma' compression names can be parsed
94 assert get_compression("some_file.xz", "infer") == "xz"
95 assert get_compression("some_file.xz", "xz") == "xz"
96 assert get_compression("some_file.xz", "lzma") == "lzma"
97
98
99 def test_cache_getitem(Cache_imp):
100 cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters))
101 assert cacher[0:4] == b"abcd"
102 assert cacher[:4] == b"abcd"
103 assert cacher[-3:] == b"XYZ"
104 assert cacher[-3:-1] == b"XY"
105 assert cacher[2:4] == b"cd"
106
107
108 def test_cache_getitem_raises():
109 cacher = BaseCache(4, letters_fetcher, len(string.ascii_letters))
110 with pytest.raises(TypeError, match="int"):
111 cacher[5]
112
113 with pytest.raises(ValueError, match="contiguous"):
114 cacher[::4]
115
116
117 def test_block_cache_lru():
118 cache = BlockCache(4, letters_fetcher, len(string.ascii_letters), maxblocks=2)
119 # miss
120 cache[0:2]
121 assert cache.cache_info().hits == 0
122 assert cache.cache_info().misses == 1
123 assert cache.cache_info().currsize == 1
124
125 # hit
126 cache[0:2]
127 assert cache.cache_info().hits == 1
128 assert cache.cache_info().misses == 1
129 assert cache.cache_info().currsize == 1
130
131 # miss
132 cache[4:6]
133 assert cache.cache_info().hits == 1
134 assert cache.cache_info().misses == 2
135 assert cache.cache_info().currsize == 2
136
137 # miss & evict
138 cache[12:13]
139 assert cache.cache_info().hits == 1
140 assert cache.cache_info().misses == 3
141 assert cache.cache_info().currsize == 2
0 """Tests abstract buffered file API, using FTP implementation"""
1 import pickle
2 import sys
3 import pytest
4 from fsspec.implementations.tests.test_ftp import FTPFileSystem
5
6 data = b"hello" * 10000
7
8
9 @pytest.mark.xfail(
10 sys.version_info < (3, 6),
11 reason="py35 error, see https://github.com/intake/filesystem_spec/issues/147",
12 )
13 def test_pickle(ftp_writable):
14 host, port, user, pw = ftp_writable
15 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
16
17 f = ftp.open("/out", "rb")
18
19 f2 = pickle.loads(pickle.dumps(f))
20 assert f == f2
21
22
23 def test_file_read_attributes(ftp_writable):
24 host, port, user, pw = ftp_writable
25 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
26
27 f = ftp.open("/out", "rb")
28 assert f.info()["size"] == len(data)
29 assert f.tell() == 0
30 assert f.seekable()
31 assert f.readable()
32 assert not f.writable()
33 out = bytearray(len(data))
34
35 assert f.read() == data
36 assert f.read() == b""
37 f.seek(0)
38 assert f.readuntil(b"l") == b"hel"
39 assert f.tell() == 3
40
41 f.readinto1(out)
42 assert out[:-3] == data[3:]
43 with pytest.raises(ValueError):
44 f.write(b"")
45 f.close()
46 with pytest.raises(ValueError):
47         f.read()
48
49
50 def test_seek(ftp_writable):
51 host, port, user, pw = ftp_writable
52 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
53
54 f = ftp.open("/out", "rb")
55
56 assert f.seek(-10, 2) == len(data) - 10
57 assert f.tell() == len(data) - 10
58 assert f.seek(-1, 1) == len(data) - 11
59 with pytest.raises(ValueError):
60 f.seek(-1)
61 with pytest.raises(ValueError):
62 f.seek(0, 7)
63
64
65 def test_file_idempotent(ftp_writable):
66 host, port, user, pw = ftp_writable
67 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
68
69 f = ftp.open("/out", "rb")
70 f2 = ftp.open("/out", "rb")
71 assert hash(f) == hash(f2)
72 assert f == f2
73 ftp.touch("/out2")
74 f2 = ftp.open("/out2", "rb")
75 assert hash(f2) != hash(f)
76 assert f != f2
77 f2 = ftp.open("/out", "wb")
78 assert hash(f2) != hash(f)
79
80
81 def test_file_text_attributes(ftp_writable):
82 host, port, user, pw = ftp_writable
83 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
84
85 data = b"hello\n" * 1000
86 with ftp.open("/out2", "wb") as f:
87 f.write(data)
88
89 f = ftp.open("/out2", "rb")
90 assert f.readline() == b"hello\n"
91 f.seek(0)
92 assert list(f) == [d + b"\n" for d in data.split()]
93 f.seek(0)
94 assert f.readlines() == [d + b"\n" for d in data.split()]
95
96 f = ftp.open("/out2", "rt")
97 assert f.readline() == "hello\n"
98 assert f.encoding
99
100
101 def test_file_write_attributes(ftp_writable):
102 host, port, user, pw = ftp_writable
103 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
104 f = ftp.open("/out2", "wb")
105 with pytest.raises(ValueError):
106 f.info()
107 with pytest.raises(ValueError):
108 f.seek(0)
109 with pytest.raises(ValueError):
110 f.read(0)
111 assert not f.readable()
112 assert f.writable()
113
114 f.flush() # no-op
115
116 assert f.write(b"hello") == 5
117 assert f.write(b"hello") == 5
118 assert not f.closed
119 f.close()
120 assert f.closed
121 with pytest.raises(ValueError):
122 f.write(b"")
123 with pytest.raises(ValueError):
124 f.flush()
125
126
127 def test_midread_cache(ftp_writable):
128 host, port, user, pw = ftp_writable
129 fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
130 fn = "/myfile"
131 with fs.open(fn, "wb") as f:
132 f.write(b"a" * 175627146)
133 with fs.open(fn, "rb") as f:
134 f.seek(175561610)
135 d1 = f.read(65536)
136 assert len(d1) == 65536
137
138 f.seek(4)
139 size = 17562198
140 d2 = f.read(size)
141 assert len(d2) == size
142
143 f.seek(17562288)
144 size = 17562187
145 d3 = f.read(size)
146 assert len(d3) == size
147
148
149 def test_read_block(ftp_writable):
150 # not the same as test_read_block in test_utils, this depends on the
151     # behaviour of the bytes caching
152 from fsspec.utils import read_block
153
154 host, port, user, pw = ftp_writable
155 fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
156 fn = "/myfile"
157 with fs.open(fn, "wb") as f:
158 f.write(b"a,b\n1,2")
159 f = fs.open(fn, "rb", cache_type="bytes")
160 assert read_block(f, 0, 6400, b"\n") == b"a,b\n1,2"
161
162
163 def test_with_gzip(ftp_writable):
164 import gzip
165
166     data = b"some compressible stuff"
167 host, port, user, pw = ftp_writable
168 fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
169 fn = "/myfile"
170 with fs.open(fn, "wb") as f:
171 gf = gzip.GzipFile(fileobj=f, mode="w")
172 gf.write(data)
173 gf.close()
174 with fs.open(fn, "rb") as f:
175 gf = gzip.GzipFile(fileobj=f, mode="r")
176 assert gf.read() == data
0 import os
1 import signal
2 import time
3 from multiprocessing import Process
4
5 import pytest
6
7 pytest.importorskip("fuse") # noqa: E402
8
9 from fsspec.fuse import run
10 from fsspec.implementations.memory import MemoryFileSystem
11
12
13 def host_fuse(mountdir):
14 fs = MemoryFileSystem()
15 fs.touch("/mounted/testfile")
16 run(fs, "/mounted/", mountdir)
17
18
19 def test_basic(tmpdir):
20 mountdir = str(tmpdir.mkdir("mount"))
21
22 fuse_process = Process(target=host_fuse, args=(str(mountdir),))
23 fuse_process.start()
24
25 try:
26 timeout = 10
27 while True:
28 try:
29 # can fail with device not ready while waiting for fuse
30 if "testfile" in os.listdir(mountdir):
31 break
32 except Exception:
33 pass
34 timeout -= 1
35 time.sleep(1)
36 assert timeout > 0, "Timeout"
37
38 fn = os.path.join(mountdir, "test")
39 with open(fn, "wb") as f:
40 f.write(b"data")
41
42 with open(fn) as f:
43 assert f.read() == "data"
44
45 os.remove(fn)
46
47 os.mkdir(fn)
48 assert os.listdir(fn) == []
49
50 os.mkdir(fn + "/inner")
51
52 with pytest.raises(OSError):
53 os.rmdir(fn)
54
55 os.rmdir(fn + "/inner")
56 os.rmdir(fn)
57 finally:
58 os.kill(fuse_process.pid, signal.SIGTERM)
59 fuse_process.join()
0 import os
1 import fsspec
2 from fsspec.implementations.memory import MemoryFileSystem
3 import pickle
4 import pytest
5
6
7 def test_mapping_prefix(tmpdir):
8 tmpdir = str(tmpdir)
9 os.makedirs(os.path.join(tmpdir, "afolder"))
10 open(os.path.join(tmpdir, "afile"), "w").write("test")
11 open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2")
12
13 m = fsspec.get_mapper("file://" + tmpdir)
14 assert "afile" in m
15 assert m["afolder/anotherfile"] == b"test2"
16
17 fs = fsspec.filesystem("file")
18 m2 = fs.get_mapper(tmpdir)
19 m3 = fs.get_mapper("file://" + tmpdir)
20
21 assert m == m2 == m3
22
23
24 def test_ops():
25 MemoryFileSystem.store.clear()
26 m = fsspec.get_mapper("memory://")
27 assert not m
28 assert list(m) == []
29
30 with pytest.raises(KeyError):
31 m["hi"]
32
33 assert m.pop("key", 0) == 0
34
35 m["key0"] = b"data"
36 assert list(m) == ["key0"]
37 assert m["key0"] == b"data"
38
39 m.clear()
40
41 assert list(m) == []
42
43
44 def test_pickle():
45 m = fsspec.get_mapper("memory://")
46 assert isinstance(m.fs, MemoryFileSystem)
47 m["key"] = b"data"
48 m2 = pickle.loads(pickle.dumps(m))
49 assert list(m) == list(m2)
50
51
52 def test_keys_view():
53 # https://github.com/intake/filesystem_spec/issues/186
54 m = fsspec.get_mapper("memory://")
55 m["key"] = b"data"
56
57 keys = m.keys()
58 assert len(keys) == 1
59 # check that we don't consume the keys
60 assert len(keys) == 1
0 import pytest
1 from fsspec.registry import get_filesystem_class, registry
2
3
4 @pytest.mark.parametrize(
5 "protocol,module,minversion,oldversion",
6 [("s3", "s3fs", "0.3.0", "0.1.0"), ("gs", "gcsfs", "0.3.0", "0.1.0")],
7 )
8 def test_minversion_s3fs(protocol, module, minversion, oldversion, monkeypatch):
9 registry.clear()
10 mod = pytest.importorskip(module, minversion)
11
12     assert get_filesystem_class(protocol) is not None
13 registry.clear()
14
15 monkeypatch.setattr(mod, "__version__", oldversion)
16 with pytest.raises(RuntimeError, match=minversion):
17 get_filesystem_class(protocol)
0 import pytest
1 from fsspec.spec import AbstractFileSystem, AbstractBufferedFile
2
3
4 class DummyTestFS(AbstractFileSystem):
5 protocol = "mock"
6 _fs_contents = (
7 {"name": "top_level/second_level/date=2019-10-01/", "type": "directory"},
8 {
9 "name": "top_level/second_level/date=2019-10-01/a.parquet",
10 "type": "file",
11 "size": 100,
12 },
13 {
14 "name": "top_level/second_level/date=2019-10-01/b.parquet",
15 "type": "file",
16 "size": 100,
17 },
18 {"name": "top_level/second_level/date=2019-10-02/", "type": "directory"},
19 {
20 "name": "top_level/second_level/date=2019-10-02/a.parquet",
21 "type": "file",
22 "size": 100,
23 },
24 {"name": "top_level/second_level/date=2019-10-04/", "type": "directory"},
25 {
26 "name": "top_level/second_level/date=2019-10-04/a.parquet",
27 "type": "file",
28 "size": 100,
29 },
30 {"name": "misc/", "type": "directory"},
31 {"name": "misc/foo.txt", "type": "file", "size": 100},
32 )
33
34 def ls(self, path, detail=True, **kwargs):
35 files = (file for file in self._fs_contents if path in file["name"])
36
37 if detail:
38 return list(files)
39
40 return list(sorted([file["name"] for file in files]))
41
42
43 @pytest.mark.parametrize(
44 "test_path, expected",
45 [
46 (
47 "mock://top_level/second_level/date=2019-10-01/a.parquet",
48 ["top_level/second_level/date=2019-10-01/a.parquet"],
49 ),
50 (
51 "mock://top_level/second_level/date=2019-10-01/*",
52 [
53 "top_level/second_level/date=2019-10-01/a.parquet",
54 "top_level/second_level/date=2019-10-01/b.parquet",
55 ],
56 ),
57 (
58 "mock://top_level/second_level/date=2019-10-0[1-4]",
59 [
60 "top_level/second_level/date=2019-10-01",
61 "top_level/second_level/date=2019-10-02",
62 "top_level/second_level/date=2019-10-04",
63 ],
64 ),
65 (
66 "mock://top_level/second_level/date=2019-10-0[1-4]/*",
67 [
68 "top_level/second_level/date=2019-10-01/a.parquet",
69 "top_level/second_level/date=2019-10-01/b.parquet",
70 "top_level/second_level/date=2019-10-02/a.parquet",
71 "top_level/second_level/date=2019-10-04/a.parquet",
72 ],
73 ),
74 (
75 "mock://top_level/second_level/date=2019-10-0[1-4]/[a].*",
76 [
77 "top_level/second_level/date=2019-10-01/a.parquet",
78 "top_level/second_level/date=2019-10-02/a.parquet",
79 "top_level/second_level/date=2019-10-04/a.parquet",
80 ],
81 ),
82 ],
83 )
84 def test_glob(test_path, expected):
85 test_fs = DummyTestFS()
86
87 assert test_fs.glob(test_path) == expected
88
89
90 def test_cache():
91 fs = DummyTestFS()
92 fs2 = DummyTestFS()
93 assert fs is fs2
94
95 assert len(fs._cache) == 1
96 del fs2
97 assert len(fs._cache) == 1
98 del fs
99 assert len(DummyTestFS._cache) == 1
100
101 DummyTestFS.clear_instance_cache()
102 assert len(DummyTestFS._cache) == 0
103
104
105 def test_alias():
106 with pytest.warns(FutureWarning, match="add_aliases"):
107 DummyTestFS(add_aliases=True)
108
109
110 def test_add_docs_warns():
111 with pytest.warns(FutureWarning, match="add_docs"):
112 AbstractFileSystem(add_docs=True)
113
114
115 def test_cache_options():
116 fs = DummyTestFS()
117 f = AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes")
118 assert f.cache.trim
119
120 # TODO: dummy buffered file
121 f = AbstractBufferedFile(
122 fs, "misc/foo.txt", cache_type="bytes", cache_options=dict(trim=False)
123 )
124 assert f.cache.trim is False
125
126 f = fs.open("misc/foo.txt", cache_type="bytes", cache_options=dict(trim=False))
127 assert f.cache.trim is False
128
129
130 def test_trim_kwarg_warns():
131 fs = DummyTestFS()
132 with pytest.warns(FutureWarning, match="cache_options"):
133 AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes", trim=False)
134
135
136 def test_eq():
137 fs = DummyTestFS()
138 result = fs == 1
139 assert result is False
0 import io
1 import pytest
2 from fsspec.utils import infer_storage_options, seek_delimiter, read_block
3
4
5 def test_read_block():
6 delimiter = b"\n"
7 data = delimiter.join([b"123", b"456", b"789"])
8 f = io.BytesIO(data)
9
10 assert read_block(f, 1, 2) == b"23"
11 assert read_block(f, 0, 1, delimiter=b"\n") == b"123\n"
12 assert read_block(f, 0, 2, delimiter=b"\n") == b"123\n"
13 assert read_block(f, 0, 3, delimiter=b"\n") == b"123\n"
14 assert read_block(f, 0, 5, delimiter=b"\n") == b"123\n456\n"
15 assert read_block(f, 0, 8, delimiter=b"\n") == b"123\n456\n789"
16 assert read_block(f, 0, 100, delimiter=b"\n") == b"123\n456\n789"
17 assert read_block(f, 1, 1, delimiter=b"\n") == b""
18 assert read_block(f, 1, 5, delimiter=b"\n") == b"456\n"
19 assert read_block(f, 1, 8, delimiter=b"\n") == b"456\n789"
20
21 for ols in [[(0, 3), (3, 3), (6, 3), (9, 2)], [(0, 4), (4, 4), (8, 4)]]:
22 out = [read_block(f, o, l, b"\n") for o, l in ols]
23 assert b"".join(filter(None, out)) == data
24
25
26 def test_read_block_split_before():
27 """Test start/middle/end cases of split_before.""" # noqa: I
28 d = (
29 "#header" + "".join(">foo{i}\nFOOBAR{i}\n".format(i=i) for i in range(100000))
30 ).encode()
31
32 # Read single record at beginning.
33 # All reads include beginning of file and read through termination of
34 # delimited record.
35 assert read_block(io.BytesIO(d), 0, 10, delimiter=b"\n") == b"#header>foo0\n"
36 assert (
37 read_block(io.BytesIO(d), 0, 10, delimiter=b"\n", split_before=True)
38 == b"#header>foo0"
39 )
40 assert (
41 read_block(io.BytesIO(d), 0, 10, delimiter=b">") == b"#header>foo0\nFOOBAR0\n>"
42 )
43 assert (
44 read_block(io.BytesIO(d), 0, 10, delimiter=b">", split_before=True)
45 == b"#header>foo0\nFOOBAR0\n"
46 )
47
48 # Read multiple records at beginning.
49 # All reads include beginning of file and read through termination of
50 # delimited record.
51 assert (
52 read_block(io.BytesIO(d), 0, 27, delimiter=b"\n")
53 == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
54 )
55 assert (
56 read_block(io.BytesIO(d), 0, 27, delimiter=b"\n", split_before=True)
57 == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1"
58 )
59 assert (
60 read_block(io.BytesIO(d), 0, 27, delimiter=b">")
61 == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n>"
62 )
63 assert (
64 read_block(io.BytesIO(d), 0, 27, delimiter=b">", split_before=True)
65 == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
66 )
67
68 # Read with offset spanning into next record, splits on either side of delimiter.
69 # Read not spanning the full record returns nothing.
70 assert read_block(io.BytesIO(d), 10, 3, delimiter=b"\n") == b"FOOBAR0\n"
71 assert (
72 read_block(io.BytesIO(d), 10, 3, delimiter=b"\n", split_before=True)
73 == b"\nFOOBAR0"
74 )
75 assert read_block(io.BytesIO(d), 10, 3, delimiter=b">") == b""
76 assert read_block(io.BytesIO(d), 10, 3, delimiter=b">", split_before=True) == b""
77
78 # Read with offset spanning multiple records, splits on either side of delimiter
79 assert (
80 read_block(io.BytesIO(d), 10, 20, delimiter=b"\n")
81 == b"FOOBAR0\n>foo1\nFOOBAR1\n"
82 )
83 assert (
84 read_block(io.BytesIO(d), 10, 20, delimiter=b"\n", split_before=True)
85 == b"\nFOOBAR0\n>foo1\nFOOBAR1"
86 )
87 assert read_block(io.BytesIO(d), 10, 20, delimiter=b">") == b"foo1\nFOOBAR1\n>"
88 assert (
89 read_block(io.BytesIO(d), 10, 20, delimiter=b">", split_before=True)
90 == b">foo1\nFOOBAR1\n"
91 )
92
93 # Read record at end, all records read to end
94
95 tlen = len(d)
96
97 assert (
98 read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n")
99 == b">foo99999\nFOOBAR99999\n"
100 )
101
102 assert (
103 read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n", split_before=True)
104 == b"\n>foo99999\nFOOBAR99999\n"
105 )
106
107 assert (
108 read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">")
109 == b"foo99999\nFOOBAR99999\n"
110 )
111
112 assert (
113 read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">", split_before=True)
114 == b">foo99999\nFOOBAR99999\n"
115 )
116
117
118 def test_seek_delimiter_endline():
119 f = io.BytesIO(b"123\n456\n789")
120
121 # if at zero, stay at zero
122 seek_delimiter(f, b"\n", 5)
123 assert f.tell() == 0
124
125 # choose the first block
126 for bs in [1, 5, 100]:
127 f.seek(1)
128 seek_delimiter(f, b"\n", blocksize=bs)
129 assert f.tell() == 4
130
131 # handle long delimiters well, even with short blocksizes
132 f = io.BytesIO(b"123abc456abc789")
133 for bs in [1, 2, 3, 4, 5, 6, 10]:
134 f.seek(1)
135 seek_delimiter(f, b"abc", blocksize=bs)
136 assert f.tell() == 6
137
138 # End at the end
139 f = io.BytesIO(b"123\n456")
140 f.seek(5)
141 seek_delimiter(f, b"\n", 5)
142 assert f.tell() == 7
143
144
145 def test_infer_options():
146 so = infer_storage_options("/mnt/datasets/test.csv")
147 assert so.pop("protocol") == "file"
148 assert so.pop("path") == "/mnt/datasets/test.csv"
149 assert not so
150
151 assert infer_storage_options("./test.csv")["path"] == "./test.csv"
152 assert infer_storage_options("../test.csv")["path"] == "../test.csv"
153
154 so = infer_storage_options("C:\\test.csv")
155 assert so.pop("protocol") == "file"
156 assert so.pop("path") == "C:\\test.csv"
157 assert not so
158
159 assert infer_storage_options("d:\\test.csv")["path"] == "d:\\test.csv"
160 assert infer_storage_options("\\test.csv")["path"] == "\\test.csv"
161 assert infer_storage_options(".\\test.csv")["path"] == ".\\test.csv"
162 assert infer_storage_options("test.csv")["path"] == "test.csv"
163
164 so = infer_storage_options(
165 "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm",
166 inherit_storage_options={"extra": "value"},
167 )
168 assert so.pop("protocol") == "hdfs"
169 assert so.pop("username") == "username"
170 assert so.pop("password") == "pwd"
171 assert so.pop("host") == "Node"
172 assert so.pop("port") == 123
173 assert so.pop("path") == "/mnt/datasets/test.csv#fragm"
174 assert so.pop("url_query") == "q=1"
175 assert so.pop("url_fragment") == "fragm"
176 assert so.pop("extra") == "value"
177 assert not so
178
179 so = infer_storage_options("hdfs://User-name@Node-name.com/mnt/datasets/test.csv")
180 assert so.pop("username") == "User-name"
181 assert so.pop("host") == "Node-name.com"
182
183 u = "http://127.0.0.1:8080/test.csv"
184 assert infer_storage_options(u) == {"protocol": "http", "path": u}
185
186 # For s3 and gcs the netloc is actually the bucket name, so we want to
187 # include it in the path. Test that:
188 # - Parsing doesn't lowercase the bucket
189 # - The bucket is included in path
190 for protocol in ["s3", "gcs", "gs"]:
191 options = infer_storage_options("%s://Bucket-name.com/test.csv" % protocol)
192 assert options["path"] == "Bucket-name.com/test.csv"
193
194 with pytest.raises(KeyError):
195 infer_storage_options("file:///bucket/file.csv", {"path": "collide"})
196 with pytest.raises(KeyError):
197 infer_storage_options("hdfs:///bucket/file.csv", {"protocol": "collide"})
198
199
200 @pytest.mark.parametrize(
201 "urlpath, expected_path",
202 (
203 (r"c:\foo\bar", r"c:\foo\bar"),
204 (r"C:\\foo\bar", r"C:\\foo\bar"),
205 (r"c:/foo/bar", r"c:/foo/bar"),
206 (r"file:///c|\foo\bar", r"c:\foo\bar"),
207 (r"file:///C|/foo/bar", r"C:/foo/bar"),
208 (r"file:///C:/foo/bar", r"C:/foo/bar"),
209 ),
210 )
211 def test_infer_storage_options_c(urlpath, expected_path):
212 so = infer_storage_options(urlpath)
213 assert so["protocol"] == "file"
214 assert so["path"] == expected_path
0 class Transaction(object):
1 """Filesystem transaction write context
2
3 Gathers files for deferred commit or discard, so that several write
4 operations can be finalized semi-atomically. This works by having this
5 instance as the ``.transaction`` attribute of the given filesystem
6 """
7
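    # Example (annotation, not part of the original source): a sketch of the
    # intended usage, assuming the filesystem exposes this instance as
    # ``fs.transaction``; files opened for writing inside the block are committed
    # together on a clean exit and discarded if an exception escapes.
    #
    #     with fs.transaction:
    #         fs.open("/path/one", "wb").write(b"...")
    #         fs.open("/path/two", "wb").write(b"...")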
8 def __init__(self, fs):
9 """
10 Parameters
11 ----------
12 fs: FileSystem instance
13 """
14 self.fs = fs
15 self.files = []
16
17 def __enter__(self):
18 self.start()
19
20 def __exit__(self, exc_type, exc_val, exc_tb):
21 """End transaction and commit, if exit is not due to exception"""
22 # only commit if there was no exception
23 self.complete(commit=exc_type is None)
24 self.fs._intrans = False
25 self.fs._transaction = None
26
27 def start(self):
28 """Start a transaction on this FileSystem"""
29 self.fs._intrans = True
30
31 def complete(self, commit=True):
32 """Finish transaction: commit or discard all deferred files"""
33 for f in self.files:
34 if commit:
35 f.commit()
36 else:
37 f.discard()
38 self.files = []
39 self.fs._intrans = False
40
41
42 class FileActor(object):
43 def __init__(self):
44 self.files = []
45
46 def commit(self):
47 for f in self.files:
48 f.commit()
49 self.files.clear()
50
51 def discard(self):
52 for f in self.files:
53 f.discard()
54 self.files.clear()
55
56 def append(self, f):
57 self.files.append(f)
58
59
60 class DaskTransaction(Transaction):
61 def __init__(self, fs):
62 """
63 Parameters
64 ----------
65 fs: FileSystem instance
66 """
67 import distributed
68
69 super().__init__(fs)
70 client = distributed.default_client()
71 self.files = client.submit(FileActor, actor=True).result()
72
73 def complete(self, commit=True):
74 """Finish transaction: commit or discard all deferred files"""
75 if commit:
76 self.files.commit().result()
77 else:
78 self.files.discard().result()
79 self.fs._intrans = False
0 from hashlib import md5
1 import math
2 import os
3 import pathlib
4 import re
5 from urllib.parse import urlsplit
6
7
8 DEFAULT_BLOCK_SIZE = 5 * 2 ** 20
9
10
11 def infer_storage_options(urlpath, inherit_storage_options=None):
12     """ Infer storage options from URL path and merge them with existing storage
13 options.
14
15 Parameters
16 ----------
17 urlpath: str or unicode
18 Either local absolute file path or URL (hdfs://namenode:8020/file.csv)
19 inherit_storage_options: dict (optional)
20 Its contents will get merged with the inferred information from the
21 given path
22
23 Returns
24 -------
25 Storage options dict.
26
27 Examples
28 --------
29 >>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP
30 {"protocol": "file", "path", "/mnt/datasets/test.csv"}
31 >>> infer_storage_options(
32 ... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1',
33 ... inherit_storage_options={'extra': 'value'}) # doctest: +SKIP
34 {"protocol": "hdfs", "username": "username", "password": "pwd",
35 "host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
36 "url_query": "q=1", "extra": "value"}
37 """
38 # Handle Windows paths including disk name in this special case
39 if re.match(r"^[a-zA-Z]:[\\/]", urlpath):
40 return {"protocol": "file", "path": urlpath}
41
42 parsed_path = urlsplit(urlpath)
43 protocol = parsed_path.scheme or "file"
44 if parsed_path.fragment:
45 path = "#".join([parsed_path.path, parsed_path.fragment])
46 else:
47 path = parsed_path.path
48 if protocol == "file":
49 # Special case parsing file protocol URL on Windows according to:
50 # https://msdn.microsoft.com/en-us/library/jj710207.aspx
51 windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
52 if windows_path:
53 path = "%s:%s" % windows_path.groups()
54
55 if protocol in ["http", "https"]:
56 # for HTTP, we don't want to parse, as requests will anyway
57 return {"protocol": protocol, "path": urlpath}
58
59 options = {"protocol": protocol, "path": path}
60
61 if parsed_path.netloc:
62 # Parse `hostname` from netloc manually because `parsed_path.hostname`
63 # lowercases the hostname which is not always desirable (e.g. in S3):
64 # https://github.com/dask/dask/issues/1417
65 options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]
66
67 if protocol in ("s3", "gcs", "gs"):
68 options["path"] = options["host"] + options["path"]
69 else:
70 options["host"] = options["host"]
71 if parsed_path.port:
72 options["port"] = parsed_path.port
73 if parsed_path.username:
74 options["username"] = parsed_path.username
75 if parsed_path.password:
76 options["password"] = parsed_path.password
77
78 if parsed_path.query:
79 options["url_query"] = parsed_path.query
80 if parsed_path.fragment:
81 options["url_fragment"] = parsed_path.fragment
82
83 if inherit_storage_options:
84 update_storage_options(options, inherit_storage_options)
85
86 return options
87
88
89 def update_storage_options(options, inherited=None):
90 if not inherited:
91 inherited = {}
92 collisions = set(options) & set(inherited)
93 if collisions:
94 collisions = "\n".join("- %r" % k for k in collisions)
95 raise KeyError(
96 "Collision between inferred and specified storage "
97 "options:\n%s" % collisions
98 )
99 options.update(inherited)
100
101
102 # Compression extensions registered via fsspec.compression.register_compression
103 compressions = {}
104
105
106 def infer_compression(filename):
107 """Infer compression, if available, from filename.
108
109 Infer a named compression type, if registered and available, from filename
110 extension. This includes builtin (gz, bz2, zip) compressions, as well as
111 optional compressions. See fsspec.compression.register_compression.
112 """
113 extension = os.path.splitext(filename)[-1].strip(".")
114 if extension in compressions:
115 return compressions[extension]
116
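# Example (annotation, not part of the original source): behaviour as exercised by
# the compression tests in this changeset (sketch):
#
#     infer_compression("data.gz")       # -> "gzip"
#     infer_compression("data.zip")      # -> "zip"
#     infer_compression("data.unknown")  # -> None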
117
118 def build_name_function(max_int):
119 """ Returns a function that receives a single integer
120 and returns it as a string padded by enough zero characters
121 to align with maximum possible integer
122
123 >>> name_f = build_name_function(57)
124
125 >>> name_f(7)
126 '07'
127 >>> name_f(31)
128 '31'
129 >>> build_name_function(1000)(42)
130 '0042'
131 >>> build_name_function(999)(42)
132 '042'
133 >>> build_name_function(0)(0)
134 '0'
135 """
136     # handle corner cases where max_int is 0 or an exact power of 10
137 max_int += 1e-8
138
139 pad_length = int(math.ceil(math.log10(max_int)))
140
141 def name_function(i):
142 return str(i).zfill(pad_length)
143
144 return name_function
145
146
147 def seek_delimiter(file, delimiter, blocksize):
148 r"""Seek current file to file start, file end, or byte after delimiter seq.
149
150 Seeks file to next chunk delimiter, where chunks are defined on file start,
151 a delimiting sequence, and file end. Use file.tell() to see location afterwards.
152 Note that file start is a valid split, so must be at offset > 0 to seek for
153 delimiter.
154
155 Parameters
156 ----------
157 file: a file
158 delimiter: bytes
159 a delimiter like ``b'\n'`` or message sentinel, matching file .read() type
160 blocksize: int
161 Number of bytes to read from the file at once.
162
163
164 Returns
165 -------
166 Returns True if a delimiter was found, False if at file start or end.
167
168 """
169
170 if file.tell() == 0:
171 # beginning-of-file, return without seek
172 return False
173
174 # Interface is for binary IO, with delimiter as bytes, but initialize last
175 # with result of file.read to preserve compatibility with text IO.
176 last = None
177 while True:
178 current = file.read(blocksize)
179 if not current:
180 # end-of-file without delimiter
181 return False
182 full = last + current if last else current
183 try:
184 if delimiter in full:
185 i = full.index(delimiter)
186 file.seek(file.tell() - (len(full) - i) + len(delimiter))
187 return True
188 elif len(current) < blocksize:
189 # end-of-file without delimiter
190 return False
191 except (OSError, ValueError):
192 pass
193 last = full[-len(delimiter) :]
194
195
196 def read_block(f, offset, length, delimiter=None, split_before=False):
197 """ Read a block of bytes from a file
198
199 Parameters
200 ----------
201 f: File
202 Open file
203 offset: int
204 Byte offset to start read
205 length: int
206 Number of bytes to read, read through end of file if None
207 delimiter: bytes (optional)
208 Ensure reading starts and stops at delimiter bytestring
209 split_before: bool (optional)
210 Start/stop read *before* delimiter bytestring.
211
212
213 If using the ``delimiter=`` keyword argument we ensure that the read
214 starts and stops at delimiter boundaries that follow the locations
215 ``offset`` and ``offset + length``. If ``offset`` is zero then we
216 start at zero, regardless of delimiter. The bytestring returned WILL
217 include the terminating delimiter string.
218
219 Examples
220 --------
221
222 >>> from io import BytesIO # doctest: +SKIP
223 >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300') # doctest: +SKIP
224 >>> read_block(f, 0, 13) # doctest: +SKIP
225 b'Alice, 100\\nBo'
226
227 >>> read_block(f, 0, 13, delimiter=b'\\n') # doctest: +SKIP
228 b'Alice, 100\\nBob, 200\\n'
229
230 >>> read_block(f, 10, 10, delimiter=b'\\n') # doctest: +SKIP
231 b'Bob, 200\\nCharlie, 300'
232 """
233 if delimiter:
234 f.seek(offset)
235 found_start_delim = seek_delimiter(f, delimiter, 2 ** 16)
236 if length is None:
237 return f.read()
238 start = f.tell()
239 length -= start - offset
240
241 f.seek(start + length)
242 found_end_delim = seek_delimiter(f, delimiter, 2 ** 16)
243 end = f.tell()
244
245 # Adjust split location to before delimiter iff seek found the
246 # delimiter sequence, not start or end of file.
247 if found_start_delim and split_before:
248 start -= len(delimiter)
249
250 if found_end_delim and split_before:
251 end -= len(delimiter)
252
253 offset = start
254 length = end - start
255
256 f.seek(offset)
257 b = f.read(length)
258 return b
259
260
261 def tokenize(*args, **kwargs):
262 """ Deterministic token
263
264 (modified from dask.base)
265
266 >>> tokenize([1, 2, '3'])
267 '9d71491b50023b06fc76928e6eddb952'
268
269 >>> tokenize('Hello') == tokenize('Hello')
270 True
271 """
272 if kwargs:
273 args += (kwargs,)
274 return md5(str(args).encode()).hexdigest()
275
276
277 def stringify_path(filepath):
278 """ Attempt to convert a path-like object to a string.
279
280 Parameters
281 ----------
282 filepath: object to be converted
283
284 Returns
285 -------
286 filepath_str: maybe a string version of the object
287
288 Notes
289 -----
290 Objects supporting the fspath protocol (Python 3.6+) are coerced
291 according to their __fspath__ method.
292
293 For backwards compatibility with older Python versions, pathlib.Path
294 objects are specially coerced.
295
296 Any other object is passed through unchanged, which includes bytes,
297 strings, buffers, or anything else that's not even path-like.
298 """
299 if hasattr(filepath, "__fspath__"):
300 return filepath.__fspath__()
301 elif isinstance(filepath, pathlib.Path):
302 return str(filepath)
303 return filepath
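# Illustrative sketch (editor's example): pathlib.Path implements __fspath__
# on Python 3.6+, so it takes the first branch; non path-like objects pass
# through unchanged.
#
#     >>> import pathlib   # doctest: +SKIP
#     >>> stringify_path(pathlib.Path("/tmp/data.csv"))   # doctest: +SKIP
#     '/tmp/data.csv'
#     >>> stringify_path(b"not-a-path")   # doctest: +SKIP
#     b'not-a-path'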
0 [tool.black]
1 # Revert to py34 target syntax to accommodate
2 # errors in trailing commas.
3 # https://github.com/psf/black/pull/763
4 target_version = ['py34']
0 conda:
1 file: docs/environment.yml
(New empty file)
0 [metadata]
1 long_description: file: README.rst
2
3 [versioneer]
4 VCS = git
5 style = pep440
6 versionfile_source = fsspec/_version.py
7 versionfile_build = fsspec/_version.py
8 tag_prefix = ""
9
10 [flake8]
11 exclude = .tox,build,docs/source/conf.py,versioneer.py
12 max-line-length = 88
13 ignore =
14 # Assigning lambda expression
15 E731
16 # Ambiguous variable names
17 E741
18 # line break before binary operator
19 W503
20 # whitespace before :
21 E203
0 #!/usr/bin/env python
1 import os
2
3 from setuptools import setup
4 import versioneer
5
6 here = os.path.abspath(os.path.dirname(__file__))
7 with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
8 long_description = f.read()
9
10 setup(
11 name="fsspec",
12 version=versioneer.get_version(),
13 cmdclass=versioneer.get_cmdclass(),
14 classifiers=[
15 "Development Status :: 4 - Beta",
16 "Intended Audience :: Developers",
17 "License :: OSI Approved :: BSD License",
18 "Operating System :: OS Independent",
19 "Programming Language :: Python :: 3.5",
20 "Programming Language :: Python :: 3.6",
21 "Programming Language :: Python :: 3.7",
22 ],
23 description="File-system specification",
24 long_description=long_description,
25 long_description_content_type="text/markdown",
26 url="http://github.com/intake/filesystem_spec",
27 maintainer="Martin Durant",
28 maintainer_email="mdurant@anaconda.com",
29 license="BSD",
30 keywords="file",
31 packages=["fsspec", "fsspec.implementations"],
32 python_requires=">=3.5",
33 install_requires=open("requirements.txt").read().strip().split("\n"),
34 zip_safe=False,
35 )
0 # content of: tox.ini , put in same dir as setup.py
1 [tox]
2 envlist = {py35,py36,py37}
3
4 [core]
5 conda_channels=
6 defaults
7 conda-forge
8 conda_deps=
9 pip
10 paramiko
11 requests
12 zstandard
13 python-snappy
14 lz4
15 distributed
16 dask
17 pyarrow
18 pyftpdlib
19 cloudpickle
20 pytest
21 pytest-cov
22 fusepy==3.0.1
23 deps=
24 hadoop-test-cluster==0.1.0
25
26 [dev]
27 conda_deps=
28 conda-forge::pre-commit=1.18
29 black=19.3b0
30 flake8
31 deps=
32
33 [testenv]
34 description=Run test suite against target versions.
35 conda_channels=
36 {[core]conda_channels}
37 conda_deps=
38 {[core]conda_deps}
39 deps=
40 {[core]deps}
41 commands =
42 py.test -v -r s
43
44 [testenv:coverage]
45 description=Run test suite with coverage enabled.
46 basepython=python3.7
47 conda_channels=
48 {[core]conda_channels}
49 conda_deps=
50 {[core]conda_deps}
51 deps=
52 {[core]deps}
53 commands =
54 py.test --cov=fsspec -v -r s
55
56 [testenv:dev]
57 description=Setup conda dev env under '.tox/dev'.
58 basepython=python3.7
59 usedevelop=True
60 conda_channels=
61 {[core]conda_channels}
62 conda_deps=
63 {[core]conda_deps}
64 {[dev]conda_deps}
65 deps=
66 {[core]deps}
67 {[dev]deps}
68 commands =
69
70 [testenv:lint]
71 description=Run pre-commit checks.
72 basepython=python3.7
73 skip_install=True
74 conda_deps=
75 {[dev]conda_deps}
76 deps=
77 {[dev]deps}
78 commands_pre=
79 pre-commit install --install-hooks
80 commands=
81 pre-commit run --all-files --show-diff-on-failure
82
83 [testenv:s3fs]
84 description=Run s3fs (@master) test suite against fsspec.
85 conda_channels=
86 defaults
87 conda-forge
88 conda_deps=
89 {[core]conda_deps}
90 boto3
91 botocore
92 httpretty
93 moto
94 six
95 mock
96 deps=
97 {[core]deps}
98 changedir=.tox/s3fs/tmp
99 whitelist_externals=
100 rm
101 git
102 setenv=
103 BOTO_CONFIG=/dev/null
104 AWS_ACCESS_KEY_ID=foobar_key
105 AWS_SECRET_ACCESS_KEY=foobar_secret
106 commands=
107 rm -rf s3fs
108 git clone https://github.com/dask/s3fs
109 py.test -vv s3fs/s3fs
110
111 [testenv:gcsfs]
112 description=Run gcsfs (@master) test suite against fsspec.
113 conda_channels=
114 defaults
115 conda-forge
116 conda_deps=
117 {[core]conda_deps}
118 requests
119 decorator
120 google-auth
121 deps=
122 {[core]deps}
123 vcrpy
124 google-auth-oauthlib
125 changedir=.tox/gcsfs/tmp
126 whitelist_externals=
127 rm
128 git
129 setenv=
130 GCSFS_RECORD_MODE=none
131 commands=
132 rm -rf gcsfs
133 git clone https://github.com/dask/gcsfs
134 py.test -vv gcsfs/gcsfs -k 'not fuse'
0 # Version: 0.18
1
2 """The Versioneer - like a rocketeer, but for versions.
3
4 The Versioneer
5 ==============
6
7 * like a rocketeer, but for versions!
8 * https://github.com/warner/python-versioneer
9 * Brian Warner
10 * License: Public Domain
11 * Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy
12 * [![Latest Version]
13 (https://pypip.in/version/versioneer/badge.svg?style=flat)
14 ](https://pypi.python.org/pypi/versioneer/)
15 * [![Build Status]
16 (https://travis-ci.org/warner/python-versioneer.png?branch=master)
17 ](https://travis-ci.org/warner/python-versioneer)
18
19 This is a tool for managing a recorded version number in distutils-based
20 python projects. The goal is to remove the tedious and error-prone "update
21 the embedded version string" step from your release process. Making a new
22 release should be as easy as recording a new tag in your version-control
23 system, and maybe making new tarballs.
24
25
26 ## Quick Install
27
28 * `pip install versioneer` to somewhere in your $PATH
29 * add a `[versioneer]` section to your setup.cfg (see below)
30 * run `versioneer install` in your source tree, commit the results
31
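For example, the `setup.cfg` of this project (shown elsewhere in this
changeset) configures Versioneer as:

    [versioneer]
    VCS = git
    style = pep440
    versionfile_source = fsspec/_version.py
    versionfile_build = fsspec/_version.py
    tag_prefix = ""
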
32 ## Version Identifiers
33
34 Source trees come from a variety of places:
35
36 * a version-control system checkout (mostly used by developers)
37 * a nightly tarball, produced by build automation
38 * a snapshot tarball, produced by a web-based VCS browser, like github's
39 "tarball from tag" feature
40 * a release tarball, produced by "setup.py sdist", distributed through PyPI
41
42 Within each source tree, the version identifier (either a string or a number,
43 this tool is format-agnostic) can come from a variety of places:
44
45 * ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows
46 about recent "tags" and an absolute revision-id
47 * the name of the directory into which the tarball was unpacked
48 * an expanded VCS keyword ($Id$, etc)
49 * a `_version.py` created by some earlier build step
50
51 For released software, the version identifier is closely related to a VCS
52 tag. Some projects use tag names that include more than just the version
53 string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool
54 needs to strip the tag prefix to extract the version identifier. For
55 unreleased software (between tags), the version identifier should provide
56 enough information to help developers recreate the same tree, while also
57 giving them an idea of roughly how old the tree is (after version 1.2, before
58 version 1.3). Many VCS systems can report a description that captures this,
59 for example `git describe --tags --dirty --always` reports things like
60 "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the
61 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has
62 uncommitted changes).
63
64 The version identifier is used for multiple purposes:
65
66 * to allow the module to self-identify its version: `myproject.__version__`
67 * to choose a name and prefix for a 'setup.py sdist' tarball
68
69 ## Theory of Operation
70
71 Versioneer works by adding a special `_version.py` file into your source
72 tree, where your `__init__.py` can import it. This `_version.py` knows how to
73 dynamically ask the VCS tool for version information at import time.
74
75 `_version.py` also contains `$Revision$` markers, and the installation
76 process marks `_version.py` to have this marker rewritten with a tag name
77 during the `git archive` command. As a result, generated tarballs will
78 contain enough information to get the proper version.
79
80 To allow `setup.py` to compute a version too, a `versioneer.py` is added to
81 the top level of your source tree, next to `setup.py` and the `setup.cfg`
82 that configures it. This overrides several distutils/setuptools commands to
83 compute the version when invoked, and changes `setup.py build` and `setup.py
84 sdist` to replace `_version.py` with a small static file that contains just
85 the generated version data.
86
87 ## Installation
88
89 See [INSTALL.md](./INSTALL.md) for detailed installation instructions.
90
91 ## Version-String Flavors
92
93 Code which uses Versioneer can learn about its version string at runtime by
94 importing `_version` from your main `__init__.py` file and running the
95 `get_versions()` function. From the "outside" (e.g. in `setup.py`), you can
96 import the top-level `versioneer.py` and run `get_versions()`.
97
98 Both functions return a dictionary with different flavors of version
99 information:
100
101 * `['version']`: A condensed version string, rendered using the selected
102 style. This is the most commonly used value for the project's version
103 string. The default "pep440" style yields strings like `0.11`,
104 `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section
105 below for alternative styles.
106
107 * `['full-revisionid']`: detailed revision identifier. For Git, this is the
108 full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac".
109
110 * `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the
111 commit date in ISO 8601 format. This will be None if the date is not
112 available.
113
114 * `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that
115 this is only accurate if run in a VCS checkout, otherwise it is likely to
116 be False or None
117
118 * `['error']`: if the version string could not be computed, this will be set
119 to a string describing the problem, otherwise it will be None. It may be
120 useful to throw an exception in setup.py if this is set, to avoid e.g.
121 creating tarballs with a version string of "unknown".
122
123 Some variants are more useful than others. Including `full-revisionid` in a
124 bug report should allow developers to reconstruct the exact code being tested
125 (or indicate the presence of local changes that should be shared with the
126 developers). `version` is suitable for display in an "about" box or a CLI
127 `--version` output: it can be easily compared against release notes and lists
128 of bugs fixed in various releases.
129
130 The installer adds the following text to your `__init__.py` to place a basic
131 version in `YOURPROJECT.__version__`:
132
133 from ._version import get_versions
134 __version__ = get_versions()['version']
135 del get_versions
136
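As a rough illustration (values here are made up for this example, reusing
the sample strings above), the dictionary returned by `get_versions()` might
look like:

    {'version': '0.11+2.g1076c97.dirty',
     'full-revisionid': '1076c978a8d3cfc70f408fe5974aa6c092c949ac',
     'dirty': True,
     'error': None,
     'date': '2017-05-23T10:05:14-0400'}
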
137 ## Styles
138
139 The setup.cfg `style=` configuration controls how the VCS information is
140 rendered into a version string.
141
142 The default style, "pep440", produces a PEP440-compliant string, equal to the
143 un-prefixed tag name for actual releases, and containing an additional "local
144 version" section with more detail for in-between builds. For Git, this is
145 TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags
146 --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the
147 tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and
148 that this commit is two revisions ("+2") beyond the "0.11" tag. For released
149 software (exactly equal to a known tag), the identifier will only contain the
150 stripped tag, e.g. "0.11".
151
152 Other styles are available. See [details.md](details.md) in the Versioneer
153 source tree for descriptions.
154
155 ## Debugging
156
157 Versioneer tries to avoid fatal errors: if something goes wrong, it will tend
158 to return a version of "0+unknown". To investigate the problem, run `setup.py
159 version`, which will run the version-lookup code in a verbose mode, and will
160 display the full contents of `get_versions()` (including the `error` string,
161 which may help identify what went wrong).
162
163 ## Known Limitations
164
165 Some situations are known to cause problems for Versioneer. This section
166 details the most significant ones. More can be found on the GitHub
167 [issues page](https://github.com/warner/python-versioneer/issues).
168
169 ### Subprojects
170
171 Versioneer has limited support for source trees in which `setup.py` is not in
172 the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are
173 two common reasons why `setup.py` might not be in the root:
174
175 * Source trees which contain multiple subprojects, such as
176 [Buildbot](https://github.com/buildbot/buildbot), which contains both
177 "master" and "slave" subprojects, each with their own `setup.py`,
178 `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI
179 distributions (and upload multiple independently-installable tarballs).
180 * Source trees whose main purpose is to contain a C library, but which also
181 provide bindings to Python (and perhaps other languages) in subdirectories.
182
183 Versioneer will look for `.git` in parent directories, and most operations
184 should get the right version string. However `pip` and `setuptools` have bugs
185 and implementation details which frequently cause `pip install .` from a
186 subproject directory to fail to find a correct version string (so it usually
187 defaults to `0+unknown`).
188
189 `pip install --editable .` should work correctly. `setup.py install` might
190 work too.
191
192 Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in
193 some later version.
194
195 [Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking
196 this issue. The discussion in
197 [PR #61](https://github.com/warner/python-versioneer/pull/61) describes the
198 issue from the Versioneer side in more detail.
199 [pip PR#3176](https://github.com/pypa/pip/pull/3176) and
200 [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve
201 pip to let Versioneer work correctly.
202
203 Versioneer-0.16 and earlier only looked for a `.git` directory next to the
204 `setup.cfg`, so subprojects were completely unsupported with those releases.
205
206 ### Editable installs with setuptools <= 18.5
207
208 `setup.py develop` and `pip install --editable .` allow you to install a
209 project into a virtualenv once, then continue editing the source code (and
210 test) without re-installing after every change.
211
212 "Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a
213 convenient way to specify executable scripts that should be installed along
214 with the python package.
215
216 These both work as expected when using modern setuptools. When using
217 setuptools-18.5 or earlier, however, certain operations will cause
218 `pkg_resources.DistributionNotFound` errors when running the entrypoint
219 script, which must be resolved by re-installing the package. This happens
220 when the install happens with one version, then the egg_info data is
221 regenerated while a different version is checked out. Many setup.py commands
222 cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into
223 a different virtualenv), so this can be surprising.
224
225 [Bug #83](https://github.com/warner/python-versioneer/issues/83) describes
226 this one, but upgrading to a newer version of setuptools should probably
227 resolve it.
228
229 ### Unicode version strings
230
231 While Versioneer works (and is continually tested) with both Python 2 and
232 Python 3, it is not entirely consistent with bytes-vs-unicode distinctions.
233 Newer releases probably generate unicode version strings on py2. It's not
234 clear that this is wrong, but it may be surprising for applications when they
235 write these strings to a network connection or include them in bytes-oriented
236 APIs like cryptographic checksums.
237
238 [Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates
239 this question.
240
241
242 ## Updating Versioneer
243
244 To upgrade your project to a new release of Versioneer, do the following:
245
246 * install the new Versioneer (`pip install -U versioneer` or equivalent)
247 * edit `setup.cfg`, if necessary, to include any new configuration settings
248 indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details.
249 * re-run `versioneer install` in your source tree, to replace
250 `SRC/_version.py`
251 * commit any changed files
252
253 ## Future Directions
254
255 This tool is designed to make it easily extended to other version-control
256 systems: all VCS-specific components are in separate directories like
257 src/git/ . The top-level `versioneer.py` script is assembled from these
258 components by running make-versioneer.py . In the future, make-versioneer.py
259 will take a VCS name as an argument, and will construct a version of
260 `versioneer.py` that is specific to the given VCS. It might also take the
261 configuration arguments that are currently provided manually during
262 installation by editing setup.py . Alternatively, it might go the other
263 direction and include code from all supported VCS systems, reducing the
264 number of intermediate scripts.
265
266
267 ## License
268
269 To make Versioneer easier to embed, all its code is dedicated to the public
270 domain. The `_version.py` that it creates is also in the public domain.
271 Specifically, both are released under the Creative Commons "Public Domain
272 Dedication" license (CC0-1.0), as described in
273 https://creativecommons.org/publicdomain/zero/1.0/ .
274
275 """
276
277 from __future__ import print_function
278
279 try:
280 import configparser
281 except ImportError:
282 import ConfigParser as configparser
283 import errno
284 import json
285 import os
286 import re
287 import subprocess
288 import sys
289
290
291 class VersioneerConfig:
292 """Container for Versioneer configuration parameters."""
293
294
295 def get_root():
296 """Get the project root directory.
297
298 We require that all commands are run from the project root, i.e. the
299 directory that contains setup.py, setup.cfg, and versioneer.py .
300 """
301 root = os.path.realpath(os.path.abspath(os.getcwd()))
302 setup_py = os.path.join(root, "setup.py")
303 versioneer_py = os.path.join(root, "versioneer.py")
304 if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
305 # allow 'python path/to/setup.py COMMAND'
306 root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
307 setup_py = os.path.join(root, "setup.py")
308 versioneer_py = os.path.join(root, "versioneer.py")
309 if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
310 err = (
311 "Versioneer was unable to run the project root directory. "
312 "Versioneer requires setup.py to be executed from "
313 "its immediate directory (like 'python setup.py COMMAND'), "
314 "or in a way that lets it use sys.argv[0] to find the root "
315 "(like 'python path/to/setup.py COMMAND')."
316 )
317 raise VersioneerBadRootError(err)
318 try:
319 # Certain runtime workflows (setup.py install/develop in a setuptools
320 # tree) execute all dependencies in a single python process, so
321 # "versioneer" may be imported multiple times, and python's shared
322 # module-import table will cache the first one. So we can't use
323 # os.path.dirname(__file__), as that will find whichever
324 # versioneer.py was first imported, even in later projects.
325 me = os.path.realpath(os.path.abspath(__file__))
326 me_dir = os.path.normcase(os.path.splitext(me)[0])
327 vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
328 if me_dir != vsr_dir:
329 print(
330 "Warning: build in %s is using versioneer.py from %s"
331 % (os.path.dirname(me), versioneer_py)
332 )
333 except NameError:
334 pass
335 return root
336
337
338 def get_config_from_root(root):
339 """Read the project setup.cfg file to determine Versioneer config."""
340 # This might raise EnvironmentError (if setup.cfg is missing), or
341 # configparser.NoSectionError (if it lacks a [versioneer] section), or
342 # configparser.NoOptionError (if it lacks "VCS="). See the docstring at
343 # the top of versioneer.py for instructions on writing your setup.cfg .
344 setup_cfg = os.path.join(root, "setup.cfg")
345 parser = configparser.SafeConfigParser()
346 with open(setup_cfg, "r") as f:
347 parser.readfp(f)
348 VCS = parser.get("versioneer", "VCS") # mandatory
349
350 def get(parser, name):
351 if parser.has_option("versioneer", name):
352 return parser.get("versioneer", name)
353 return None
354
355 cfg = VersioneerConfig()
356 cfg.VCS = VCS
357 cfg.style = get(parser, "style") or ""
358 cfg.versionfile_source = get(parser, "versionfile_source")
359 cfg.versionfile_build = get(parser, "versionfile_build")
360 cfg.tag_prefix = get(parser, "tag_prefix")
361 if cfg.tag_prefix in ("''", '""'):
362 cfg.tag_prefix = ""
363 cfg.parentdir_prefix = get(parser, "parentdir_prefix")
364 cfg.verbose = get(parser, "verbose")
365 return cfg
366
367
368 class NotThisMethod(Exception):
369 """Exception raised if a method is not valid for the current scenario."""
370
371
372 # these dictionaries contain VCS-specific tools
373 LONG_VERSION_PY = {}
374 HANDLERS = {}
375
376
377 def register_vcs_handler(vcs, method): # decorator
378 """Decorator to mark a method as the handler for a particular VCS."""
379
380 def decorate(f):
381 """Store f in HANDLERS[vcs][method]."""
382 if vcs not in HANDLERS:
383 HANDLERS[vcs] = {}
384 HANDLERS[vcs][method] = f
385 return f
386
387 return decorate
388
389
390 def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
391 """Call the given command(s)."""
392 assert isinstance(commands, list)
393 p = None
394 for c in commands:
395 try:
396 dispcmd = str([c] + args)
397 # remember shell=False, so use git.cmd on windows, not just git
398 p = subprocess.Popen(
399 [c] + args,
400 cwd=cwd,
401 env=env,
402 stdout=subprocess.PIPE,
403 stderr=(subprocess.PIPE if hide_stderr else None),
404 )
405 break
406 except EnvironmentError:
407 e = sys.exc_info()[1]
408 if e.errno == errno.ENOENT:
409 continue
410 if verbose:
411 print("unable to run %s" % dispcmd)
412 print(e)
413 return None, None
414 else:
415 if verbose:
416 print("unable to find command, tried %s" % (commands,))
417 return None, None
418 stdout = p.communicate()[0].strip()
419 if sys.version_info[0] >= 3:
420 stdout = stdout.decode()
421 if p.returncode != 0:
422 if verbose:
423 print("unable to run %s (error)" % dispcmd)
424 print("stdout was %s" % stdout)
425 return None, p.returncode
426 return stdout, p.returncode
427
428
429 LONG_VERSION_PY[
430 "git"
431 ] = '''
432 # This file helps to compute a version number in source trees obtained from
433 # git-archive tarball (such as those provided by githubs download-from-tag
434 # feature). Distribution tarballs (built by setup.py sdist) and build
435 # directories (produced by setup.py build) will contain a much shorter file
436 # that just contains the computed version number.
437
438 # This file is released into the public domain. Generated by
439 # versioneer-0.18 (https://github.com/warner/python-versioneer)
440
441 """Git implementation of _version.py."""
442
443 import errno
444 import os
445 import re
446 import subprocess
447 import sys
448
449
450 def get_keywords():
451 """Get the keywords needed to look up the version information."""
452 # these strings will be replaced by git during git-archive.
453 # setup.py/versioneer.py will grep for the variable names, so they must
454 # each be defined on a line of their own. _version.py will just call
455 # get_keywords().
456 git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s"
457 git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s"
458 git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s"
459 keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
460 return keywords
461
462
463 class VersioneerConfig:
464 """Container for Versioneer configuration parameters."""
465
466
467 def get_config():
468 """Create, populate and return the VersioneerConfig() object."""
469 # these strings are filled in when 'setup.py versioneer' creates
470 # _version.py
471 cfg = VersioneerConfig()
472 cfg.VCS = "git"
473 cfg.style = "%(STYLE)s"
474 cfg.tag_prefix = "%(TAG_PREFIX)s"
475 cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s"
476 cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s"
477 cfg.verbose = False
478 return cfg
479
480
481 class NotThisMethod(Exception):
482 """Exception raised if a method is not valid for the current scenario."""
483
484
485 LONG_VERSION_PY = {}
486 HANDLERS = {}
487
488
489 def register_vcs_handler(vcs, method): # decorator
490 """Decorator to mark a method as the handler for a particular VCS."""
491 def decorate(f):
492 """Store f in HANDLERS[vcs][method]."""
493 if vcs not in HANDLERS:
494 HANDLERS[vcs] = {}
495 HANDLERS[vcs][method] = f
496 return f
497 return decorate
498
499
500 def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
501 env=None):
502 """Call the given command(s)."""
503 assert isinstance(commands, list)
504 p = None
505 for c in commands:
506 try:
507 dispcmd = str([c] + args)
508 # remember shell=False, so use git.cmd on windows, not just git
509 p = subprocess.Popen([c] + args, cwd=cwd, env=env,
510 stdout=subprocess.PIPE,
511 stderr=(subprocess.PIPE if hide_stderr
512 else None))
513 break
514 except EnvironmentError:
515 e = sys.exc_info()[1]
516 if e.errno == errno.ENOENT:
517 continue
518 if verbose:
519 print("unable to run %%s" %% dispcmd)
520 print(e)
521 return None, None
522 else:
523 if verbose:
524 print("unable to find command, tried %%s" %% (commands,))
525 return None, None
526 stdout = p.communicate()[0].strip()
527 if sys.version_info[0] >= 3:
528 stdout = stdout.decode()
529 if p.returncode != 0:
530 if verbose:
531 print("unable to run %%s (error)" %% dispcmd)
532 print("stdout was %%s" %% stdout)
533 return None, p.returncode
534 return stdout, p.returncode
535
536
537 def versions_from_parentdir(parentdir_prefix, root, verbose):
538 """Try to determine the version from the parent directory name.
539
540 Source tarballs conventionally unpack into a directory that includes both
541 the project name and a version string. We will also support searching up
542 two directory levels for an appropriately named parent directory
543 """
544 rootdirs = []
545
546 for i in range(3):
547 dirname = os.path.basename(root)
548 if dirname.startswith(parentdir_prefix):
549 return {"version": dirname[len(parentdir_prefix):],
550 "full-revisionid": None,
551 "dirty": False, "error": None, "date": None}
552 else:
553 rootdirs.append(root)
554 root = os.path.dirname(root) # up a level
555
556 if verbose:
557 print("Tried directories %%s but none started with prefix %%s" %%
558 (str(rootdirs), parentdir_prefix))
559 raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
560
561
562 @register_vcs_handler("git", "get_keywords")
563 def git_get_keywords(versionfile_abs):
564 """Extract version information from the given file."""
565 # the code embedded in _version.py can just fetch the value of these
566 # keywords. When used from setup.py, we don't want to import _version.py,
567 # so we do it with a regexp instead. This function is not used from
568 # _version.py.
569 keywords = {}
570 try:
571 f = open(versionfile_abs, "r")
572 for line in f.readlines():
573 if line.strip().startswith("git_refnames ="):
574 mo = re.search(r'=\s*"(.*)"', line)
575 if mo:
576 keywords["refnames"] = mo.group(1)
577 if line.strip().startswith("git_full ="):
578 mo = re.search(r'=\s*"(.*)"', line)
579 if mo:
580 keywords["full"] = mo.group(1)
581 if line.strip().startswith("git_date ="):
582 mo = re.search(r'=\s*"(.*)"', line)
583 if mo:
584 keywords["date"] = mo.group(1)
585 f.close()
586 except EnvironmentError:
587 pass
588 return keywords
589
590
591 @register_vcs_handler("git", "keywords")
592 def git_versions_from_keywords(keywords, tag_prefix, verbose):
593 """Get version information from git keywords."""
594 if not keywords:
595 raise NotThisMethod("no keywords at all, weird")
596 date = keywords.get("date")
597 if date is not None:
598 # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant
599 # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601
600 # -like" string, which we must then edit to make compliant), because
601 # it's been around since git-1.5.3, and it's too difficult to
602 # discover which version we're using, or to work around using an
603 # older one.
604 date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
605 refnames = keywords["refnames"].strip()
606 if refnames.startswith("$Format"):
607 if verbose:
608 print("keywords are unexpanded, not using")
609 raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
610 refs = set([r.strip() for r in refnames.strip("()").split(",")])
611 # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
612 # just "foo-1.0". If we see a "tag: " prefix, prefer those.
613 TAG = "tag: "
614 tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
615 if not tags:
616 # Either we're using git < 1.8.3, or there really are no tags. We use
617 # a heuristic: assume all version tags have a digit. The old git %%d
618 # expansion behaves like git log --decorate=short and strips out the
619 # refs/heads/ and refs/tags/ prefixes that would let us distinguish
620 # between branches and tags. By ignoring refnames without digits, we
621 # filter out many common branch names like "release" and
622 # "stabilization", as well as "HEAD" and "master".
623 tags = set([r for r in refs if re.search(r'\d', r)])
624 if verbose:
625 print("discarding '%%s', no digits" %% ",".join(refs - tags))
626 if verbose:
627 print("likely tags: %%s" %% ",".join(sorted(tags)))
628 for ref in sorted(tags):
629 # sorting will prefer e.g. "2.0" over "2.0rc1"
630 if ref.startswith(tag_prefix):
631 r = ref[len(tag_prefix):]
632 if verbose:
633 print("picking %%s" %% r)
634 return {"version": r,
635 "full-revisionid": keywords["full"].strip(),
636 "dirty": False, "error": None,
637 "date": date}
638 # no suitable tags, so version is "0+unknown", but full hex is still there
639 if verbose:
640 print("no suitable tags, using unknown + full revision id")
641 return {"version": "0+unknown",
642 "full-revisionid": keywords["full"].strip(),
643 "dirty": False, "error": "no suitable tags", "date": None}
644
645
646 @register_vcs_handler("git", "pieces_from_vcs")
647 def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
648 """Get version from 'git describe' in the root of the source tree.
649
650 This only gets called if the git-archive 'subst' keywords were *not*
651 expanded, and _version.py hasn't already been rewritten with a short
652 version string, meaning we're inside a checked out source tree.
653 """
654 GITS = ["git"]
655 if sys.platform == "win32":
656 GITS = ["git.cmd", "git.exe"]
657
658 out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
659 hide_stderr=True)
660 if rc != 0:
661 if verbose:
662 print("Directory %%s not under git control" %% root)
663 raise NotThisMethod("'git rev-parse --git-dir' returned error")
664
665 # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
666 # if there isn't one, this yields HEX[-dirty] (no NUM)
667 describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
668 "--always", "--long",
669 "--match", "%%s*" %% tag_prefix],
670 cwd=root)
671 # --long was added in git-1.5.5
672 if describe_out is None:
673 raise NotThisMethod("'git describe' failed")
674 describe_out = describe_out.strip()
675 full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
676 if full_out is None:
677 raise NotThisMethod("'git rev-parse' failed")
678 full_out = full_out.strip()
679
680 pieces = {}
681 pieces["long"] = full_out
682 pieces["short"] = full_out[:7] # maybe improved later
683 pieces["error"] = None
684
685 # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
686 # TAG might have hyphens.
687 git_describe = describe_out
688
689 # look for -dirty suffix
690 dirty = git_describe.endswith("-dirty")
691 pieces["dirty"] = dirty
692 if dirty:
693 git_describe = git_describe[:git_describe.rindex("-dirty")]
694
695 # now we have TAG-NUM-gHEX or HEX
696
697 if "-" in git_describe:
698 # TAG-NUM-gHEX
699 mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
700 if not mo:
701 # unparseable. Maybe git-describe is misbehaving?
702 pieces["error"] = ("unable to parse git-describe output: '%%s'"
703 %% describe_out)
704 return pieces
705
706 # tag
707 full_tag = mo.group(1)
708 if not full_tag.startswith(tag_prefix):
709 if verbose:
710 fmt = "tag '%%s' doesn't start with prefix '%%s'"
711 print(fmt %% (full_tag, tag_prefix))
712 pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'"
713 %% (full_tag, tag_prefix))
714 return pieces
715 pieces["closest-tag"] = full_tag[len(tag_prefix):]
716
717 # distance: number of commits since tag
718 pieces["distance"] = int(mo.group(2))
719
720 # commit: short hex revision ID
721 pieces["short"] = mo.group(3)
722
723 else:
724 # HEX: no tags
725 pieces["closest-tag"] = None
726 count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
727 cwd=root)
728 pieces["distance"] = int(count_out) # total number of commits
729
730 # commit date: see ISO-8601 comment in git_versions_from_keywords()
731 date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"],
732 cwd=root)[0].strip()
733 pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
734
735 return pieces
736
737
738 def plus_or_dot(pieces):
739 """Return a + if we don't already have one, else return a ."""
740 if "+" in pieces.get("closest-tag", ""):
741 return "."
742 return "+"
743
744
745 def render_pep440(pieces):
746 """Build up version string, with post-release "local version identifier".
747
748 Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
749 get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
750
751 Exceptions:
752 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
753 """
754 if pieces["closest-tag"]:
755 rendered = pieces["closest-tag"]
756 if pieces["distance"] or pieces["dirty"]:
757 rendered += plus_or_dot(pieces)
758 rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"])
759 if pieces["dirty"]:
760 rendered += ".dirty"
761 else:
762 # exception #1
763 rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"],
764 pieces["short"])
765 if pieces["dirty"]:
766 rendered += ".dirty"
767 return rendered
768
769
770 def render_pep440_pre(pieces):
771 """TAG[.post.devDISTANCE] -- No -dirty.
772
773 Exceptions:
774 1: no tags. 0.post.devDISTANCE
775 """
776 if pieces["closest-tag"]:
777 rendered = pieces["closest-tag"]
778 if pieces["distance"]:
779 rendered += ".post.dev%%d" %% pieces["distance"]
780 else:
781 # exception #1
782 rendered = "0.post.dev%%d" %% pieces["distance"]
783 return rendered
784
785
786 def render_pep440_post(pieces):
787 """TAG[.postDISTANCE[.dev0]+gHEX] .
788
789 The ".dev0" means dirty. Note that .dev0 sorts backwards
790 (a dirty tree will appear "older" than the corresponding clean one),
791 but you shouldn't be releasing software with -dirty anyways.
792
793 Exceptions:
794 1: no tags. 0.postDISTANCE[.dev0]
795 """
796 if pieces["closest-tag"]:
797 rendered = pieces["closest-tag"]
798 if pieces["distance"] or pieces["dirty"]:
799 rendered += ".post%%d" %% pieces["distance"]
800 if pieces["dirty"]:
801 rendered += ".dev0"
802 rendered += plus_or_dot(pieces)
803 rendered += "g%%s" %% pieces["short"]
804 else:
805 # exception #1
806 rendered = "0.post%%d" %% pieces["distance"]
807 if pieces["dirty"]:
808 rendered += ".dev0"
809 rendered += "+g%%s" %% pieces["short"]
810 return rendered
811
812
813 def render_pep440_old(pieces):
814 """TAG[.postDISTANCE[.dev0]] .
815
816 The ".dev0" means dirty.
817
818 Exceptions:
819 1: no tags. 0.postDISTANCE[.dev0]
820 """
821 if pieces["closest-tag"]:
822 rendered = pieces["closest-tag"]
823 if pieces["distance"] or pieces["dirty"]:
824 rendered += ".post%%d" %% pieces["distance"]
825 if pieces["dirty"]:
826 rendered += ".dev0"
827 else:
828 # exception #1
829 rendered = "0.post%%d" %% pieces["distance"]
830 if pieces["dirty"]:
831 rendered += ".dev0"
832 return rendered
833
834
835 def render_git_describe(pieces):
836 """TAG[-DISTANCE-gHEX][-dirty].
837
838 Like 'git describe --tags --dirty --always'.
839
840 Exceptions:
841 1: no tags. HEX[-dirty] (note: no 'g' prefix)
842 """
843 if pieces["closest-tag"]:
844 rendered = pieces["closest-tag"]
845 if pieces["distance"]:
846 rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"])
847 else:
848 # exception #1
849 rendered = pieces["short"]
850 if pieces["dirty"]:
851 rendered += "-dirty"
852 return rendered
853
854
855 def render_git_describe_long(pieces):
856 """TAG-DISTANCE-gHEX[-dirty].
857
858 Like 'git describe --tags --dirty --always --long'.
859 The distance/hash is unconditional.
860
861 Exceptions:
862 1: no tags. HEX[-dirty] (note: no 'g' prefix)
863 """
864 if pieces["closest-tag"]:
865 rendered = pieces["closest-tag"]
866 rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"])
867 else:
868 # exception #1
869 rendered = pieces["short"]
870 if pieces["dirty"]:
871 rendered += "-dirty"
872 return rendered
873
874
875 def render(pieces, style):
876 """Render the given version pieces into the requested style."""
877 if pieces["error"]:
878 return {"version": "unknown",
879 "full-revisionid": pieces.get("long"),
880 "dirty": None,
881 "error": pieces["error"],
882 "date": None}
883
884 if not style or style == "default":
885 style = "pep440" # the default
886
887 if style == "pep440":
888 rendered = render_pep440(pieces)
889 elif style == "pep440-pre":
890 rendered = render_pep440_pre(pieces)
891 elif style == "pep440-post":
892 rendered = render_pep440_post(pieces)
893 elif style == "pep440-old":
894 rendered = render_pep440_old(pieces)
895 elif style == "git-describe":
896 rendered = render_git_describe(pieces)
897 elif style == "git-describe-long":
898 rendered = render_git_describe_long(pieces)
899 else:
900 raise ValueError("unknown style '%%s'" %% style)
901
902 return {"version": rendered, "full-revisionid": pieces["long"],
903 "dirty": pieces["dirty"], "error": None,
904 "date": pieces.get("date")}
905
906
907 def get_versions():
908 """Get version information or return default if unable to do so."""
909 # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
910 # __file__, we can work backwards from there to the root. Some
911 # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
912 # case we can only use expanded keywords.
913
914 cfg = get_config()
915 verbose = cfg.verbose
916
917 try:
918 return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
919 verbose)
920 except NotThisMethod:
921 pass
922
923 try:
924 root = os.path.realpath(__file__)
925 # versionfile_source is the relative path from the top of the source
926 # tree (where the .git directory might live) to this file. Invert
927 # this to find the root from __file__.
928 for i in cfg.versionfile_source.split('/'):
929 root = os.path.dirname(root)
930 except NameError:
931 return {"version": "0+unknown", "full-revisionid": None,
932 "dirty": None,
933 "error": "unable to find root of source tree",
934 "date": None}
935
936 try:
937 pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
938 return render(pieces, cfg.style)
939 except NotThisMethod:
940 pass
941
942 try:
943 if cfg.parentdir_prefix:
944 return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
945 except NotThisMethod:
946 pass
947
948 return {"version": "0+unknown", "full-revisionid": None,
949 "dirty": None,
950 "error": "unable to compute version", "date": None}
951 '''
952
953
954 @register_vcs_handler("git", "get_keywords")
955 def git_get_keywords(versionfile_abs):
956 """Extract version information from the given file."""
957 # the code embedded in _version.py can just fetch the value of these
958 # keywords. When used from setup.py, we don't want to import _version.py,
959 # so we do it with a regexp instead. This function is not used from
960 # _version.py.
961 keywords = {}
962 try:
963 f = open(versionfile_abs, "r")
964 for line in f.readlines():
965 if line.strip().startswith("git_refnames ="):
966 mo = re.search(r'=\s*"(.*)"', line)
967 if mo:
968 keywords["refnames"] = mo.group(1)
969 if line.strip().startswith("git_full ="):
970 mo = re.search(r'=\s*"(.*)"', line)
971 if mo:
972 keywords["full"] = mo.group(1)
973 if line.strip().startswith("git_date ="):
974 mo = re.search(r'=\s*"(.*)"', line)
975 if mo:
976 keywords["date"] = mo.group(1)
977 f.close()
978 except EnvironmentError:
979 pass
980 return keywords
981
982
983 @register_vcs_handler("git", "keywords")
984 def git_versions_from_keywords(keywords, tag_prefix, verbose):
985 """Get version information from git keywords."""
986 if not keywords:
987 raise NotThisMethod("no keywords at all, weird")
988 date = keywords.get("date")
989 if date is not None:
990 # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
991 # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
992 # -like" string, which we must then edit to make compliant), because
993 # it's been around since git-1.5.3, and it's too difficult to
994 # discover which version we're using, or to work around using an
995 # older one.
996 date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
997 refnames = keywords["refnames"].strip()
998 if refnames.startswith("$Format"):
999 if verbose:
1000 print("keywords are unexpanded, not using")
1001 raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
1002 refs = set([r.strip() for r in refnames.strip("()").split(",")])
1003 # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
1004 # just "foo-1.0". If we see a "tag: " prefix, prefer those.
1005 TAG = "tag: "
1006 tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
1007 if not tags:
1008 # Either we're using git < 1.8.3, or there really are no tags. We use
1009 # a heuristic: assume all version tags have a digit. The old git %d
1010 # expansion behaves like git log --decorate=short and strips out the
1011 # refs/heads/ and refs/tags/ prefixes that would let us distinguish
1012 # between branches and tags. By ignoring refnames without digits, we
1013 # filter out many common branch names like "release" and
1014 # "stabilization", as well as "HEAD" and "master".
1015 tags = set([r for r in refs if re.search(r"\d", r)])
1016 if verbose:
1017 print("discarding '%s', no digits" % ",".join(refs - tags))
1018 if verbose:
1019 print("likely tags: %s" % ",".join(sorted(tags)))
1020 for ref in sorted(tags):
1021 # sorting will prefer e.g. "2.0" over "2.0rc1"
1022 if ref.startswith(tag_prefix):
1023 r = ref[len(tag_prefix) :]
1024 if verbose:
1025 print("picking %s" % r)
1026 return {
1027 "version": r,
1028 "full-revisionid": keywords["full"].strip(),
1029 "dirty": False,
1030 "error": None,
1031 "date": date,
1032 }
1033 # no suitable tags, so version is "0+unknown", but full hex is still there
1034 if verbose:
1035 print("no suitable tags, using unknown + full revision id")
1036 return {
1037 "version": "0+unknown",
1038 "full-revisionid": keywords["full"].strip(),
1039 "dirty": False,
1040 "error": "no suitable tags",
1041 "date": None,
1042 }
1043
1044
1045 @register_vcs_handler("git", "pieces_from_vcs")
1046 def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
1047 """Get version from 'git describe' in the root of the source tree.
1048
1049 This only gets called if the git-archive 'subst' keywords were *not*
1050 expanded, and _version.py hasn't already been rewritten with a short
1051 version string, meaning we're inside a checked out source tree.
1052 """
1053 GITS = ["git"]
1054 if sys.platform == "win32":
1055 GITS = ["git.cmd", "git.exe"]
1056
1057 out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
1058 if rc != 0:
1059 if verbose:
1060 print("Directory %s not under git control" % root)
1061 raise NotThisMethod("'git rev-parse --git-dir' returned error")
1062
1063 # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
1064 # if there isn't one, this yields HEX[-dirty] (no NUM)
1065 describe_out, rc = run_command(
1066 GITS,
1067 [
1068 "describe",
1069 "--tags",
1070 "--dirty",
1071 "--always",
1072 "--long",
1073 "--match",
1074 "%s*" % tag_prefix,
1075 ],
1076 cwd=root,
1077 )
1078 # --long was added in git-1.5.5
1079 if describe_out is None:
1080 raise NotThisMethod("'git describe' failed")
1081 describe_out = describe_out.strip()
1082 full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
1083 if full_out is None:
1084 raise NotThisMethod("'git rev-parse' failed")
1085 full_out = full_out.strip()
1086
1087 pieces = {}
1088 pieces["long"] = full_out
1089 pieces["short"] = full_out[:7] # maybe improved later
1090 pieces["error"] = None
1091
1092 # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
1093 # TAG might have hyphens.
1094 git_describe = describe_out
1095
1096 # look for -dirty suffix
1097 dirty = git_describe.endswith("-dirty")
1098 pieces["dirty"] = dirty
1099 if dirty:
1100 git_describe = git_describe[: git_describe.rindex("-dirty")]
1101
1102 # now we have TAG-NUM-gHEX or HEX
1103
1104 if "-" in git_describe:
1105 # TAG-NUM-gHEX
1106 mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
1107 if not mo:
1108 # unparseable. Maybe git-describe is misbehaving?
1109 pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
1110 return pieces
1111
1112 # tag
1113 full_tag = mo.group(1)
1114 if not full_tag.startswith(tag_prefix):
1115 if verbose:
1116 fmt = "tag '%s' doesn't start with prefix '%s'"
1117 print(fmt % (full_tag, tag_prefix))
1118 pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
1119 full_tag,
1120 tag_prefix,
1121 )
1122 return pieces
1123 pieces["closest-tag"] = full_tag[len(tag_prefix) :]
1124
1125 # distance: number of commits since tag
1126 pieces["distance"] = int(mo.group(2))
1127
1128 # commit: short hex revision ID
1129 pieces["short"] = mo.group(3)
1130
1131 else:
1132 # HEX: no tags
1133 pieces["closest-tag"] = None
1134 count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
1135 pieces["distance"] = int(count_out) # total number of commits
1136
1137 # commit date: see ISO-8601 comment in git_versions_from_keywords()
1138 date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[
1139 0
1140 ].strip()
1141 pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
1142
1143 return pieces
1144
1145
1146 def do_vcs_install(manifest_in, versionfile_source, ipy):
1147 """Git-specific installation logic for Versioneer.
1148
1149 For Git, this means creating/changing .gitattributes to mark _version.py
1150 for export-subst keyword substitution.
1151 """
1152 GITS = ["git"]
1153 if sys.platform == "win32":
1154 GITS = ["git.cmd", "git.exe"]
1155 files = [manifest_in, versionfile_source]
1156 if ipy:
1157 files.append(ipy)
1158 try:
1159 me = __file__
1160 if me.endswith(".pyc") or me.endswith(".pyo"):
1161 me = os.path.splitext(me)[0] + ".py"
1162 versioneer_file = os.path.relpath(me)
1163 except NameError:
1164 versioneer_file = "versioneer.py"
1165 files.append(versioneer_file)
1166 present = False
1167 try:
1168 f = open(".gitattributes", "r")
1169 for line in f.readlines():
1170 if line.strip().startswith(versionfile_source):
1171 if "export-subst" in line.strip().split()[1:]:
1172 present = True
1173 f.close()
1174 except EnvironmentError:
1175 pass
1176 if not present:
1177 f = open(".gitattributes", "a+")
1178 f.write("%s export-subst\n" % versionfile_source)
1179 f.close()
1180 files.append(".gitattributes")
1181 run_command(GITS, ["add", "--"] + files)
1182
1183
1184 def versions_from_parentdir(parentdir_prefix, root, verbose):
1185 """Try to determine the version from the parent directory name.
1186
1187 Source tarballs conventionally unpack into a directory that includes both
1188 the project name and a version string. We will also support searching up
1189 two directory levels for an appropriately named parent directory
1190 """
1191 rootdirs = []
1192
1193 for i in range(3):
1194 dirname = os.path.basename(root)
1195 if dirname.startswith(parentdir_prefix):
1196 return {
1197 "version": dirname[len(parentdir_prefix) :],
1198 "full-revisionid": None,
1199 "dirty": False,
1200 "error": None,
1201 "date": None,
1202 }
1203 else:
1204 rootdirs.append(root)
1205 root = os.path.dirname(root) # up a level
1206
1207 if verbose:
1208 print(
1209 "Tried directories %s but none started with prefix %s"
1210 % (str(rootdirs), parentdir_prefix)
1211 )
1212 raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
1213
1214
1215 SHORT_VERSION_PY = """
1216 # This file was generated by 'versioneer.py' (0.18) from
1217 # revision-control system data, or from the parent directory name of an
1218 # unpacked source archive. Distribution tarballs contain a pre-generated copy
1219 # of this file.
1220
1221 import json
1222
1223 version_json = '''
1224 %s
1225 ''' # END VERSION_JSON
1226
1227
1228 def get_versions():
1229 return json.loads(version_json)
1230 """
1231
1232
1233 def versions_from_file(filename):
1234 """Try to determine the version from _version.py if present."""
1235 try:
1236 with open(filename) as f:
1237 contents = f.read()
1238 except EnvironmentError:
1239 raise NotThisMethod("unable to read _version.py")
1240 mo = re.search(
1241 r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S
1242 )
1243 if not mo:
1244 mo = re.search(
1245 r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S
1246 )
1247 if not mo:
1248 raise NotThisMethod("no version_json in _version.py")
1249 return json.loads(mo.group(1))
1250
1251
1252 def write_to_version_file(filename, versions):
1253 """Write the given version number to the given _version.py file."""
1254 os.unlink(filename)
1255 contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": "))
1256 with open(filename, "w") as f:
1257 f.write(SHORT_VERSION_PY % contents)
1258
1259 print("set %s to '%s'" % (filename, versions["version"]))
1260
1261
1262 def plus_or_dot(pieces):
1263 """Return a + if we don't already have one, else return a ."""
1264 if "+" in pieces.get("closest-tag", ""):
1265 return "."
1266 return "+"
1267
1268
1269 def render_pep440(pieces):
1270 """Build up version string, with post-release "local version identifier".
1271
1272 Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
1273 get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
1274
1275 Exceptions:
1276 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
1277 """
1278 if pieces["closest-tag"]:
1279 rendered = pieces["closest-tag"]
1280 if pieces["distance"] or pieces["dirty"]:
1281 rendered += plus_or_dot(pieces)
1282 rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
1283 if pieces["dirty"]:
1284 rendered += ".dirty"
1285 else:
1286 # exception #1
1287 rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
1288 if pieces["dirty"]:
1289 rendered += ".dirty"
1290 return rendered
1291
1292
1293 def render_pep440_pre(pieces):
1294 """TAG[.post.devDISTANCE] -- No -dirty.
1295
1296 Exceptions:
1297 1: no tags. 0.post.devDISTANCE
1298 """
1299 if pieces["closest-tag"]:
1300 rendered = pieces["closest-tag"]
1301 if pieces["distance"]:
1302 rendered += ".post.dev%d" % pieces["distance"]
1303 else:
1304 # exception #1
1305 rendered = "0.post.dev%d" % pieces["distance"]
1306 return rendered
1307
1308
1309 def render_pep440_post(pieces):
1310 """TAG[.postDISTANCE[.dev0]+gHEX] .
1311
1312 The ".dev0" means dirty. Note that .dev0 sorts backwards
1313 (a dirty tree will appear "older" than the corresponding clean one),
1314 but you shouldn't be releasing software with -dirty anyways.
1315
1316 Exceptions:
1317 1: no tags. 0.postDISTANCE[.dev0]
1318 """
1319 if pieces["closest-tag"]:
1320 rendered = pieces["closest-tag"]
1321 if pieces["distance"] or pieces["dirty"]:
1322 rendered += ".post%d" % pieces["distance"]
1323 if pieces["dirty"]:
1324 rendered += ".dev0"
1325 rendered += plus_or_dot(pieces)
1326 rendered += "g%s" % pieces["short"]
1327 else:
1328 # exception #1
1329 rendered = "0.post%d" % pieces["distance"]
1330 if pieces["dirty"]:
1331 rendered += ".dev0"
1332 rendered += "+g%s" % pieces["short"]
1333 return rendered
1334
1335
1336 def render_pep440_old(pieces):
1337 """TAG[.postDISTANCE[.dev0]] .
1338
1339 The ".dev0" means dirty.
1340
1341 Exceptions:
1342 1: no tags. 0.postDISTANCE[.dev0]
1343 """
1344 if pieces["closest-tag"]:
1345 rendered = pieces["closest-tag"]
1346 if pieces["distance"] or pieces["dirty"]:
1347 rendered += ".post%d" % pieces["distance"]
1348 if pieces["dirty"]:
1349 rendered += ".dev0"
1350 else:
1351 # exception #1
1352 rendered = "0.post%d" % pieces["distance"]
1353 if pieces["dirty"]:
1354 rendered += ".dev0"
1355 return rendered
1356
1357
1358 def render_git_describe(pieces):
1359 """TAG[-DISTANCE-gHEX][-dirty].
1360
1361 Like 'git describe --tags --dirty --always'.
1362
1363 Exceptions:
1364 1: no tags. HEX[-dirty] (note: no 'g' prefix)
1365 """
1366 if pieces["closest-tag"]:
1367 rendered = pieces["closest-tag"]
1368 if pieces["distance"]:
1369 rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
1370 else:
1371 # exception #1
1372 rendered = pieces["short"]
1373 if pieces["dirty"]:
1374 rendered += "-dirty"
1375 return rendered
1376
1377
1378 def render_git_describe_long(pieces):
1379 """TAG-DISTANCE-gHEX[-dirty].
1380
1381 Like 'git describe --tags --dirty --always --long'.
1382 The distance/hash is unconditional.
1383
1384 Exceptions:
1385 1: no tags. HEX[-dirty] (note: no 'g' prefix)
1386 """
1387 if pieces["closest-tag"]:
1388 rendered = pieces["closest-tag"]
1389 rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
1390 else:
1391 # exception #1
1392 rendered = pieces["short"]
1393 if pieces["dirty"]:
1394 rendered += "-dirty"
1395 return rendered
1396
1397
1398 def render(pieces, style):
1399 """Render the given version pieces into the requested style."""
1400 if pieces["error"]:
1401 return {
1402 "version": "unknown",
1403 "full-revisionid": pieces.get("long"),
1404 "dirty": None,
1405 "error": pieces["error"],
1406 "date": None,
1407 }
1408
1409 if not style or style == "default":
1410 style = "pep440" # the default
1411
1412 if style == "pep440":
1413 rendered = render_pep440(pieces)
1414 elif style == "pep440-pre":
1415 rendered = render_pep440_pre(pieces)
1416 elif style == "pep440-post":
1417 rendered = render_pep440_post(pieces)
1418 elif style == "pep440-old":
1419 rendered = render_pep440_old(pieces)
1420 elif style == "git-describe":
1421 rendered = render_git_describe(pieces)
1422 elif style == "git-describe-long":
1423 rendered = render_git_describe_long(pieces)
1424 else:
1425 raise ValueError("unknown style '%s'" % style)
1426
1427 return {
1428 "version": rendered,
1429 "full-revisionid": pieces["long"],
1430 "dirty": pieces["dirty"],
1431 "error": None,
1432 "date": pieces.get("date"),
1433 }
1434
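# Illustrative only: a hypothetical call like
#   render({"closest-tag": "1.2.0", "distance": 0, "short": "abc1234",
#           "dirty": False, "long": "abc1234deadbeef", "error": None,
#           "date": "2019-10-01T00:00:00"}, "pep440")
# would return {"version": "1.2.0", "full-revisionid": "abc1234deadbeef",
#               "dirty": False, "error": None, "date": "2019-10-01T00:00:00"}.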
1435
1436 class VersioneerBadRootError(Exception):
1437 """The project root directory is unknown or missing key files."""
1438
1439
1440 def get_versions(verbose=False):
1441 """Get the project version from whatever source is available.
1442
1443 Returns a dict with 'version', 'full-revisionid', 'dirty', 'error' and 'date' keys.
1444 """
1445 if "versioneer" in sys.modules:
1446 # see the discussion in cmdclass.py:get_cmdclass()
1447 del sys.modules["versioneer"]
1448
1449 root = get_root()
1450 cfg = get_config_from_root(root)
1451
1452 assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg"
1453 handlers = HANDLERS.get(cfg.VCS)
1454 assert handlers, "unrecognized VCS '%s'" % cfg.VCS
1455 verbose = verbose or cfg.verbose
1456 assert (
1457 cfg.versionfile_source is not None
1458 ), "please set versioneer.versionfile_source"
1459 assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"
1460
1461 versionfile_abs = os.path.join(root, cfg.versionfile_source)
1462
1463 # extract version from the first of these that works: expanded VCS keywords
1464 # (git archive tarballs), _version.py, a VCS command (e.g. 'git describe'),
1465 # then the parent directory name. This is meant to work for developers using
1466 # a source checkout, for users of a tarball created by 'setup.py sdist', and
1467 # for users of a tarball/zipball created by 'git archive' or github's download-from-tag feature or the equivalent in other VCSes.
1468
1469 get_keywords_f = handlers.get("get_keywords")
1470 from_keywords_f = handlers.get("keywords")
1471 if get_keywords_f and from_keywords_f:
1472 try:
1473 keywords = get_keywords_f(versionfile_abs)
1474 ver = from_keywords_f(keywords, cfg.tag_prefix, verbose)
1475 if verbose:
1476 print("got version from expanded keyword %s" % ver)
1477 return ver
1478 except NotThisMethod:
1479 pass
1480
1481 try:
1482 ver = versions_from_file(versionfile_abs)
1483 if verbose:
1484 print("got version from file %s %s" % (versionfile_abs, ver))
1485 return ver
1486 except NotThisMethod:
1487 pass
1488
1489 from_vcs_f = handlers.get("pieces_from_vcs")
1490 if from_vcs_f:
1491 try:
1492 pieces = from_vcs_f(cfg.tag_prefix, root, verbose)
1493 ver = render(pieces, cfg.style)
1494 if verbose:
1495 print("got version from VCS %s" % ver)
1496 return ver
1497 except NotThisMethod:
1498 pass
1499
1500 try:
1501 if cfg.parentdir_prefix:
1502 ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
1503 if verbose:
1504 print("got version from parentdir %s" % ver)
1505 return ver
1506 except NotThisMethod:
1507 pass
1508
1509 if verbose:
1510 print("unable to compute version")
1511
1512 return {
1513 "version": "0+unknown",
1514 "full-revisionid": None,
1515 "dirty": None,
1516 "error": "unable to compute version",
1517 "date": None,
1518 }
1519
1520
1521 def get_version():
1522 """Get the short version string for this project."""
1523 return get_versions()["version"]
1524
1525
1526 def get_cmdclass():
1527 """Get the custom setuptools/distutils subclasses used by Versioneer."""
1528 if "versioneer" in sys.modules:
1529 del sys.modules["versioneer"]
1530 # this fixes the "python setup.py develop" case (also 'install' and
1531 # 'easy_install .'), in which subdependencies of the main project are
1532 # built (using setup.py bdist_egg) in the same python process. Assume
1533 # a main project A and a dependency B, which use different versions
1534 # of Versioneer. A's setup.py imports A's Versioneer, leaving it in
1535 # sys.modules by the time B's setup.py is executed, causing B to run
1536 # with the wrong versioneer. Setuptools wraps the sub-dep builds in a
1537 # sandbox that restores sys.modules to its pre-build state, so the
1538 # parent is protected against the child's "import versioneer". By
1539 # removing ourselves from sys.modules here, before the child build
1540 # happens, we protect the child from the parent's versioneer too.
1541 # Also see https://github.com/warner/python-versioneer/issues/52
1542
1543 cmds = {}
1544
1545 # we add "version" to both distutils and setuptools
1546 from distutils.core import Command
1547
1548 class cmd_version(Command):
1549 description = "report generated version string"
1550 user_options = []
1551 boolean_options = []
1552
1553 def initialize_options(self):
1554 pass
1555
1556 def finalize_options(self):
1557 pass
1558
1559 def run(self):
1560 vers = get_versions(verbose=True)
1561 print("Version: %s" % vers["version"])
1562 print(" full-revisionid: %s" % vers.get("full-revisionid"))
1563 print(" dirty: %s" % vers.get("dirty"))
1564 print(" date: %s" % vers.get("date"))
1565 if vers["error"]:
1566 print(" error: %s" % vers["error"])
1567
1568 cmds["version"] = cmd_version
1569
1570 # we override "build_py" in both distutils and setuptools
1571 #
1572 # most invocation pathways end up running build_py:
1573 # distutils/build -> build_py
1574 # distutils/install -> distutils/build ->..
1575 # setuptools/bdist_wheel -> distutils/install ->..
1576 # setuptools/bdist_egg -> distutils/install_lib -> build_py
1577 # setuptools/install -> bdist_egg ->..
1578 # setuptools/develop -> ?
1579 # pip install:
1580 # copies source tree to a tempdir before running egg_info/etc
1581 # if .git isn't copied too, 'git describe' will fail
1582 # then does setup.py bdist_wheel, or sometimes setup.py install
1583 # setup.py egg_info -> ?
1584
1585 # we override different "build_py" commands for both environments
1586 if "setuptools" in sys.modules:
1587 from setuptools.command.build_py import build_py as _build_py
1588 else:
1589 from distutils.command.build_py import build_py as _build_py
1590
1591 class cmd_build_py(_build_py):
1592 def run(self):
1593 root = get_root()
1594 cfg = get_config_from_root(root)
1595 versions = get_versions()
1596 _build_py.run(self)
1597 # now locate _version.py in the new build/ directory and replace
1598 # it with an updated value
1599 if cfg.versionfile_build:
1600 target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build)
1601 print("UPDATING %s" % target_versionfile)
1602 write_to_version_file(target_versionfile, versions)
1603
1604 cmds["build_py"] = cmd_build_py
1605
1606 if "cx_Freeze" in sys.modules: # cx_freeze enabled?
1607 from cx_Freeze.dist import build_exe as _build_exe
1608
1609 # nczeczulin reports that py2exe won't like the pep440-style string
1610 # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
1611 # setup(console=[{
1612 # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION
1613 # "product_version": versioneer.get_version(),
1614 # ...
1615
1616 class cmd_build_exe(_build_exe):
1617 def run(self):
1618 root = get_root()
1619 cfg = get_config_from_root(root)
1620 versions = get_versions()
1621 target_versionfile = cfg.versionfile_source
1622 print("UPDATING %s" % target_versionfile)
1623 write_to_version_file(target_versionfile, versions)
1624
1625 _build_exe.run(self)
1626 os.unlink(target_versionfile)
1627 with open(cfg.versionfile_source, "w") as f:
1628 LONG = LONG_VERSION_PY[cfg.VCS]
1629 f.write(
1630 LONG
1631 % {
1632 "DOLLAR": "$",
1633 "STYLE": cfg.style,
1634 "TAG_PREFIX": cfg.tag_prefix,
1635 "PARENTDIR_PREFIX": cfg.parentdir_prefix,
1636 "VERSIONFILE_SOURCE": cfg.versionfile_source,
1637 }
1638 )
1639
1640 cmds["build_exe"] = cmd_build_exe
1641 del cmds["build_py"]
1642
1643 if "py2exe" in sys.modules: # py2exe enabled?
1644 try:
1645 from py2exe.distutils_buildexe import py2exe as _py2exe # py3
1646 except ImportError:
1647 from py2exe.build_exe import py2exe as _py2exe # py2
1648
1649 class cmd_py2exe(_py2exe):
1650 def run(self):
1651 root = get_root()
1652 cfg = get_config_from_root(root)
1653 versions = get_versions()
1654 target_versionfile = cfg.versionfile_source
1655 print("UPDATING %s" % target_versionfile)
1656 write_to_version_file(target_versionfile, versions)
1657
1658 _py2exe.run(self)
1659 os.unlink(target_versionfile)
1660 with open(cfg.versionfile_source, "w") as f:
1661 LONG = LONG_VERSION_PY[cfg.VCS]
1662 f.write(
1663 LONG
1664 % {
1665 "DOLLAR": "$",
1666 "STYLE": cfg.style,
1667 "TAG_PREFIX": cfg.tag_prefix,
1668 "PARENTDIR_PREFIX": cfg.parentdir_prefix,
1669 "VERSIONFILE_SOURCE": cfg.versionfile_source,
1670 }
1671 )
1672
1673 cmds["py2exe"] = cmd_py2exe
1674
1675 # we override different "sdist" commands for both environments
1676 if "setuptools" in sys.modules:
1677 from setuptools.command.sdist import sdist as _sdist
1678 else:
1679 from distutils.command.sdist import sdist as _sdist
1680
1681 class cmd_sdist(_sdist):
1682 def run(self):
1683 versions = get_versions()
1684 self._versioneer_generated_versions = versions
1685 # unless we update this, the command will keep using the old
1686 # version
1687 self.distribution.metadata.version = versions["version"]
1688 return _sdist.run(self)
1689
1690 def make_release_tree(self, base_dir, files):
1691 root = get_root()
1692 cfg = get_config_from_root(root)
1693 _sdist.make_release_tree(self, base_dir, files)
1694 # now locate _version.py in the new base_dir directory
1695 # (remembering that it may be a hardlink) and replace it with an
1696 # updated value
1697 target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
1698 print("UPDATING %s" % target_versionfile)
1699 write_to_version_file(
1700 target_versionfile, self._versioneer_generated_versions
1701 )
1702
1703 cmds["sdist"] = cmd_sdist
1704
1705 return cmds
1706
1707
1708 CONFIG_ERROR = """
1709 setup.cfg is missing the necessary Versioneer configuration. You need
1710 a section like:
1711
1712 [versioneer]
1713 VCS = git
1714 style = pep440
1715 versionfile_source = src/myproject/_version.py
1716 versionfile_build = myproject/_version.py
1717 tag_prefix =
1718 parentdir_prefix = myproject-
1719
1720 You will also need to edit your setup.py to use the results:
1721
1722 import versioneer
1723 setup(version=versioneer.get_version(),
1724 cmdclass=versioneer.get_cmdclass(), ...)
1725
1726 Please read the docstring in ./versioneer.py for configuration instructions,
1727 edit setup.cfg, and re-run the installer or 'python versioneer.py setup'.
1728 """
1729
1730 SAMPLE_CONFIG = """
1731 # See the docstring in versioneer.py for instructions. Note that you must
1732 # re-run 'versioneer.py setup' after changing this section, and commit the
1733 # resulting files.
1734
1735 [versioneer]
1736 #VCS = git
1737 #style = pep440
1738 #versionfile_source =
1739 #versionfile_build =
1740 #tag_prefix =
1741 #parentdir_prefix =
1742
1743 """
1744
1745 INIT_PY_SNIPPET = """
1746 from ._version import get_versions
1747 __version__ = get_versions()['version']
1748 del get_versions
1749 """
1750
1751
1752 def do_setup():
1753 """Main VCS-independent setup function for installing Versioneer."""
1754 root = get_root()
1755 try:
1756 cfg = get_config_from_root(root)
1757 except (
1758 EnvironmentError,
1759 configparser.NoSectionError,
1760 configparser.NoOptionError,
1761 ) as e:
1762 if isinstance(e, (EnvironmentError, configparser.NoSectionError)):
1763 print("Adding sample versioneer config to setup.cfg", file=sys.stderr)
1764 with open(os.path.join(root, "setup.cfg"), "a") as f:
1765 f.write(SAMPLE_CONFIG)
1766 print(CONFIG_ERROR, file=sys.stderr)
1767 return 1
1768
1769 print(" creating %s" % cfg.versionfile_source)
1770 with open(cfg.versionfile_source, "w") as f:
1771 LONG = LONG_VERSION_PY[cfg.VCS]
1772 f.write(
1773 LONG
1774 % {
1775 "DOLLAR": "$",
1776 "STYLE": cfg.style,
1777 "TAG_PREFIX": cfg.tag_prefix,
1778 "PARENTDIR_PREFIX": cfg.parentdir_prefix,
1779 "VERSIONFILE_SOURCE": cfg.versionfile_source,
1780 }
1781 )
1782
1783 ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py")
1784 if os.path.exists(ipy):
1785 try:
1786 with open(ipy, "r") as f:
1787 old = f.read()
1788 except EnvironmentError:
1789 old = ""
1790 if INIT_PY_SNIPPET not in old:
1791 print(" appending to %s" % ipy)
1792 with open(ipy, "a") as f:
1793 f.write(INIT_PY_SNIPPET)
1794 else:
1795 print(" %s unmodified" % ipy)
1796 else:
1797 print(" %s doesn't exist, ok" % ipy)
1798 ipy = None
1799
1800 # Make sure both the top-level "versioneer.py" and versionfile_source
1801 # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so
1802 # they'll be copied into source distributions. Pip won't be able to
1803 # install the package without this.
1804 manifest_in = os.path.join(root, "MANIFEST.in")
1805 simple_includes = set()
1806 try:
1807 with open(manifest_in, "r") as f:
1808 for line in f:
1809 if line.startswith("include "):
1810 for include in line.split()[1:]:
1811 simple_includes.add(include)
1812 except EnvironmentError:
1813 pass
1814 # That doesn't cover everything MANIFEST.in can do
1815 # (http://docs.python.org/2/distutils/sourcedist.html#commands), so
1816 # it might give some false negatives. Appending redundant 'include'
1817 # lines is safe, though.
1818 if "versioneer.py" not in simple_includes:
1819 print(" appending 'versioneer.py' to MANIFEST.in")
1820 with open(manifest_in, "a") as f:
1821 f.write("include versioneer.py\n")
1822 else:
1823 print(" 'versioneer.py' already in MANIFEST.in")
1824 if cfg.versionfile_source not in simple_includes:
1825 print(
1826 " appending versionfile_source ('%s') to MANIFEST.in"
1827 % cfg.versionfile_source
1828 )
1829 with open(manifest_in, "a") as f:
1830 f.write("include %s\n" % cfg.versionfile_source)
1831 else:
1832 print(" versionfile_source already in MANIFEST.in")
1833
1834 # Make VCS-specific changes. For git, this means creating/changing
1835 # .gitattributes to mark _version.py for export-subst keyword
1836 # substitution.
1837 do_vcs_install(manifest_in, cfg.versionfile_source, ipy)
1838 return 0
1839
1840
1841 def scan_setup_py():
1842 """Validate the contents of setup.py against Versioneer's expectations."""
1843 found = set()
1844 setters = False
1845 errors = 0
1846 with open("setup.py", "r") as f:
1847 for line in f.readlines():
1848 if "import versioneer" in line:
1849 found.add("import")
1850 if "versioneer.get_cmdclass()" in line:
1851 found.add("cmdclass")
1852 if "versioneer.get_version()" in line:
1853 found.add("get_version")
1854 if "versioneer.VCS" in line:
1855 setters = True
1856 if "versioneer.versionfile_source" in line:
1857 setters = True
1858 if len(found) != 3:
1859 print("")
1860 print("Your setup.py appears to be missing some important items")
1861 print("(but I might be wrong). Please make sure it has something")
1862 print("roughly like the following:")
1863 print("")
1864 print(" import versioneer")
1865 print(" setup( version=versioneer.get_version(),")
1866 print(" cmdclass=versioneer.get_cmdclass(), ...)")
1867 print("")
1868 errors += 1
1869 if setters:
1870 print("You should remove lines like 'versioneer.VCS = ' and")
1871 print("'versioneer.versionfile_source = ' . This configuration")
1872 print("now lives in setup.cfg, and should be removed from setup.py")
1873 print("")
1874 errors += 1
1875 return errors
1876
1877
1878 if __name__ == "__main__":
1879 cmd = sys.argv[1]
1880 if cmd == "setup":
1881 errors = do_setup()
1882 errors += scan_setup_py()
1883 if errors:
1884 sys.exit(1)