fsspec / f7aa29e
New upstream version 0.6.0 eamanu 4 years ago
72 changed file(s) with 11736 addition(s) and 0 deletion(s).
0 [run]
1 omit =
2 */test_*.py
3 fsspec/_version.py
4 source =
5 fsspec
6
7 [report]
8 # Regexes for lines to exclude from consideration
9 exclude_lines =
10 pragma: no cover
11
12 raise AssertionError
13 raise NotImplementedError
14 pass
15
16 ignore_errors = True
0 fsspec/_version.py export-subst
0 # Dask
1 dask-worker-space
2
3 # Byte-compiled / optimized / DLL files
4 __pycache__/
5 *.py[cod]
6 *$py.class
7
8 # C extensions
9 *.so
10
11 # Distribution / packaging
12 .Python
13 env/
14 build/
15 develop-eggs/
16 dist/
17 downloads/
18 eggs/
19 .eggs/
20 lib/
21 lib64/
22 parts/
23 sdist/
24 var/
25 wheels/
26 *.egg-info/
27 .installed.cfg
28 *.egg
29 pip-wheel-metadata/
30
31 # PyInstaller
32 # Usually these files are written by a python script from a template
33 # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 *.manifest
35 *.spec
36
37 # Installer logs
38 pip-log.txt
39 pip-delete-this-directory.txt
40
41 # Unit test / coverage reports
42 htmlcov/
43 .tox/
44 .coverage
45 .coverage.*
46 .cache
47 nosetests.xml
48 coverage.xml
49 *.cover
50 .hypothesis/
51
52 # Translations
53 *.mo
54 *.pot
55
56 # Django stuff:
57 *.log
58 local_settings.py
59
60 # Flask stuff:
61 instance/
62 .webassets-cache
63
64 # Scrapy stuff:
65 .scrapy
66
67 # Sphinx documentation
68 docs/_build/
69
70 # PyBuilder
71 target/
72
73 # Jupyter Notebook
74 .ipynb_checkpoints
75
76 # pyenv
77 .python-version
78
79 # celery beat schedule file
80 celerybeat-schedule
81
82 # SageMath parsed files
83 *.sage.py
84
85 # dotenv
86 .env
87
88 # virtualenv
89 .venv
90 venv/
91 ENV/
92 .idea/
93
94 # Spyder project settings
95 .spyderproject
96 .spyproject
97
98 # Rope project settings
99 .ropeproject
100
101 # mkdocs documentation
102 /site
103
104 # mypy
105 .mypy_cache/
0 exclude: >
1 (?x)^(
2 \.tox/.*
3 )$
4 default_language_version:
5 python: python3.7
6 repos:
7 - repo: local
8 hooks:
9 - id: black
10 name: black
11 entry: black
12 language: python
13 require_serial: true
14 types: [python]
15 - repo: https://github.com/pre-commit/pre-commit-hooks
16 rev: v2.3.0
17 hooks:
18 - id: flake8
0 sudo: required
1 dist: xenial
2 os:
3 - linux
4 services:
5 - docker
6
7 language: generic
8 env:
9 - TOXENV=py35
10 - TOXENV=py36
11 - TOXENV=py37
12 - TOXENV=coverage
13 - TOXENV=lint
14 - TOXENV=s3fs
15 - TOXENV=gcsfs
16 install:
17 - source ci/install.sh
18 script:
19 - tox -v
20
21 notifications:
22 email: false
0 BSD 3-Clause License
1
2 Copyright (c) 2018, Martin Durant
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice, this
9 list of conditions and the following disclaimer.
10
11 * Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 * Neither the name of the copyright holder nor the names of its
16 contributors may be used to endorse or promote products derived from
17 this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0 include versioneer.py
1 include fsspec/_version.py
2
3 include LICENSE
4 include README.rst
5 include requirements.txt
0 # filesystem_spec
1
2 [![Build Status](https://travis-ci.org/intake/filesystem_spec.svg?branch=master)](https://travis-ci.org/intake/filesystem_spec)
3 [![Docs](https://readthedocs.org/projects/filesystem-spec/badge/?version=latest)](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)
4
5 A specification for pythonic filesystems.
6
7 ## Install
8
9 ```bash
10 pip install fsspec
11 ```
12 or
13 ```bash
14 conda install -c conda-forge fsspec
15 ```
16
17 ## Purpose
18
19 To produce a template or specification for a file-system interface that specific implementations should follow,
20 so that applications making use of them can rely on a common behaviour and not have to worry about the specific
21 internal implementation decisions with any given backend. Many such implementations are included in this package,
22 or in sister projects such as `s3fs` and `gcsfs`.
23
24 In addition, if this is well designed, then additional functionality, such as a key-value store or FUSE
25 mounting of the file-system implementation, may be available for all implementations "for free".
26
27 ## Documentation
28
29 Please refer to [RTD](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest)
30
31 ## Develop
32
33 fsspec uses [tox](https://tox.readthedocs.io/en/latest/) and
34 [tox-conda](https://github.com/tox-dev/tox-conda) to manage dev and test
35 environments. First, install tox and tox-conda into a base conda environment
36 (e.g. `conda install -c conda-forge tox tox-conda`). Calls to `tox` can then be
37 used to configure a development environment and run tests.
38
39 Next, set up a development conda environment via `tox -e dev`. This will
40 install fsspec dependencies, test & dev tools, and install fsspec in develop
41 mode. Then, activate the dev environment under `.tox/dev` via `conda activate .tox/dev`.
42
43 ### Testing
44
45 Tests can be run directly in the activated dev environment via `pytest fsspec`.
46
47 The full fsspec test suite can be run via `tox`, which will set up and execute
48 tests against multiple dependency versions in isolated environments. Run `tox
49 -av` to list available test environments, and select one via `tox -e <env>`.
50
51 The full fsspec suite requires a system-level docker, docker-compose, and fuse
52 installation. See `ci/install.sh` for a detailed installation example.
53
54 ### Code Formatting
55
56 fsspec uses [Black](https://black.readthedocs.io/en/stable) to ensure
57 a consistent code format throughout the project. ``black`` is automatically
58 installed in the tox dev env, activated via `conda activate .tox/dev`.
59
60 Then, run `black fsspec` from the root of the filesystem_spec repository to
61 auto-format your code. Additionally, many editors have plugins that will apply
62 `black` as you edit files.
63
64 Optionally, you may wish to set up [pre-commit hooks](https://pre-commit.com) to
65 automatically run `black` when you make a git commit. ``black`` is automatically
66 installed in the tox dev env, activated via `conda activate .tox/dev`.
67
68 Then, run `pre-commit install --install-hooks` from the root of the
69 filesystem_spec repository to set up pre-commit hooks. `black` will now be run
70 before you commit, reformatting any changed files. You can format without
71 committing via `pre-commit run` or skip these checks with `git commit
72 --no-verify`.
0 #!/usr/bin/env bash
1 # https://docs.travis-ci.com/user/docker/#using-docker-compose
2
3
4 DOCKER_COMPOSE_VERSION=${DOCKER_COMPOSE_VERSION:-1.23.2}
5
6 # Install docker
7 curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
8 sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
9 sudo apt-get update
10 sudo apt-get -y -o Dpkg::Options::="--force-confnew" install docker-ce
11
12 # Update docker-compose
13 sudo rm /usr/local/bin/docker-compose
14 curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose
15 chmod +x docker-compose
16 sudo mv docker-compose /usr/local/bin
17
18 # install FUSE
19 sudo apt-get install libfuse-dev
20
21 # install conda
22 source $(dirname $BASH_SOURCE)/install_conda.sh
0 #!/usr/bin/env bash
1
2 # Install conda
3 wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
4 bash miniconda.sh -b -p $HOME/miniconda
5 export PATH="$HOME/miniconda/bin:$PATH"
6 conda config --set always_yes yes --set changeps1 no
7 conda update conda
8 conda install -c conda-forge tox tox-conda
0 # Minimal makefile for Sphinx documentation
1 #
2
3 # You can set these variables from the command line.
4 SPHINXOPTS =
5 SPHINXBUILD = sphinx-build
6 SPHINXPROJ = fsspec
7 SOURCEDIR = source
8 BUILDDIR = build
9
10 # Put it first so that "make" without argument is like "make help".
11 help:
12 @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13
14 .PHONY: help Makefile
15
16 # Catch-all target: route all unknown targets to Sphinx using the new
17 # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 %: Makefile
19 @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
0 # Building Documentation
1
2 A basic Python environment with the packages listed in `./requirements.txt` is
3 required to build the docs; see also `environment.yml`.
4
5 To make HTML documentation:
6
7 ```bash
8 make html
9 ```
10
11 Outputs to `build/html/index.html`
0 name: fsspec
1 channels:
2 - defaults
3 - conda-forge
4 dependencies:
5 - python=3.6
6 - paramiko
7 - requests
8 - numpydoc
0 @ECHO OFF
1
2 pushd %~dp0
3
4 REM Command file for Sphinx documentation
5
6 if "%SPHINXBUILD%" == "" (
7 set SPHINXBUILD=sphinx-build
8 )
9 set SOURCEDIR=source
10 set BUILDDIR=build
11 set SPHINXPROJ=fsspec
12
13 if "%1" == "" goto help
14
15 %SPHINXBUILD% >NUL 2>NUL
16 if errorlevel 9009 (
17 echo.
18 echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 echo.installed, then set the SPHINXBUILD environment variable to point
20 echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 echo.may add the Sphinx directory to PATH.
22 echo.
23 echo.If you don't have Sphinx installed, grab it from
24 echo.http://sphinx-doc.org/
25 exit /b 1
26 )
27
28 %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 goto end
30
31 :help
32 %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33
34 :end
35 popd
0 API Reference
1 =============
2
3 .. currentmodule:: fsspec
4
5 User Functions
6 --------------
7
8 .. autosummary::
9 fsspec.open_files
10 fsspec.open
11 fsspec.filesystem
12 fsspec.get_filesystem_class
13 fsspec.get_mapper
14 fsspec.fuse.run
15
16 .. autofunction:: fsspec.open_files
17 .. autofunction:: fsspec.open
18 .. autofunction:: fsspec.filesystem
19 .. autofunction:: fsspec.get_filesystem_class
20 .. autofunction:: fsspec.get_mapper
21 .. autofunction:: fsspec.fuse.run
22
23 Base Classes
24 ------------
25
26 .. autosummary::
27 fsspec.spec.AbstractFileSystem
28 fsspec.spec.Transaction
29 fsspec.spec.AbstractBufferedFile
30 fsspec.FSMap
31 fsspec.core.OpenFile
32 fsspec.core.BaseCache
33
34 .. autoclass:: fsspec.spec.AbstractFileSystem
35
36 .. autoclass:: fsspec.spec.Transaction
37 :members:
38
39 .. autoclass:: fsspec.spec.AbstractBufferedFile
40 :members:
41
42 .. autoclass:: fsspec.FSMap
43 :members:
44
45 .. autoclass:: fsspec.core.OpenFile
46 :members:
47
48 .. autoclass:: fsspec.core.BaseCache
49 :members:
50
51
52 .. _implementations:
53
54 Built-in Implementations
55 ------------------------
56
57 .. autosummary::
58 fsspec.implementations.ftp.FTPFileSystem
59 fsspec.implementations.hdfs.PyArrowHDFS
60 fsspec.implementations.http.HTTPFileSystem
61 fsspec.implementations.local.LocalFileSystem
62 fsspec.implementations.memory.MemoryFileSystem
63 fsspec.implementations.sftp.SFTPFileSystem
64 fsspec.implementations.webhdfs.WebHDFS
65 fsspec.implementations.zip.ZipFileSystem
66 fsspec.implementations.cached.CachingFileSystem
67 fsspec.implementations.cached.WholeFileCacheFileSystem
68
69 .. autoclass:: fsspec.implementations.ftp.FTPFileSystem
70 :members: __init__
71
72 .. autoclass:: fsspec.implementations.hdfs.PyArrowHDFS
73 :members: __init__
74
75 .. autoclass:: fsspec.implementations.http.HTTPFileSystem
76 :members: __init__
77
78 .. autoclass:: fsspec.implementations.local.LocalFileSystem
79 :members:
80
81 .. autoclass:: fsspec.implementations.memory.MemoryFileSystem
82 :members: __init__
83
84 .. autoclass:: fsspec.implementations.sftp.SFTPFileSystem
85 :members: __init__
86
87 .. autoclass:: fsspec.implementations.webhdfs.WebHDFS
88 :members: __init__
89
90 .. autoclass:: fsspec.implementations.zip.ZipFileSystem
91 :members: __init__
92
93 .. autoclass:: fsspec.implementations.cached.CachingFileSystem
94 :members: __init__
95
96 .. autoclass:: fsspec.implementations.cached.WholeFileCacheFileSystem
97
98 .. _readbuffering:
99
100 Read Buffering
101 --------------
102
103 .. autosummary::
104
105 fsspec.caching.ReadAheadCache
106 fsspec.caching.BytesCache
107 fsspec.caching.MMapCache
108 fsspec.caching.BlockCache
109
110 .. autoclass:: fsspec.caching.ReadAheadCache
111 :members:
112
113 .. autoclass:: fsspec.caching.BytesCache
114 :members:
115
116 .. autoclass:: fsspec.caching.MMapCache
117 :members:
118
119 .. autoclass:: fsspec.caching.BlockCache
120 :members:
0 Changelog
1 =========
2
3 Version 0.6.0
4 -------------
5
6 * Fixed issues with filesystem instance caching. This was causing authorization errors
7 in downstream libraries like ``gcsfs`` and ``s3fs`` in multi-threaded code (:pr:`155`, :pr:`181`)
8 * Changed the default file caching strategy to :class:`fsspec.caching.ReadAheadCache` (:pr:`193`)
9 * Moved file caches to the new ``fsspec.caching`` module. They're still available from
10 their old location in ``fsspec.core``, but we recommend using the new location for new code (:pr:`195`)
11 * Added a new file caching strategy, :class:`fsspec.caching.BlockCache` for fetching and caching
12 file reads in blocks (:pr:`191`).
13 * Fixed equality checks for file system instances to return ``False`` when compared to objects
14 other than file systems (:pr:`192`)
15 * Fixed a bug in :meth:`fsspec.FSMap.keys` returning a generator, which was consumed upon iteration (:pr:`189`).
16 * Removed the magic addition of aliases in ``AbstractFileSystem.__init__``. Now alias methods are always
17 present (:pr:`177`)
18 * Deprecated passing ``trim`` to :class:`fsspec.spec.AbstractBufferedFile`. Pass it in ``storage_options`` instead (:pr:`188`)
19 * Improved handling of requests for :class:`fsspec.implementations.http.HTTPFileSystem` when the
20 HTTP server responds with an (incorrect) content-length of 0 (:pr:`163`)
21 * Added a ``detail=True`` parameter to :meth:`fsspec.spec.AbstractFileSystem.ls` (:pr:`168`)
22 * Fixed handling of UNC/DFS paths (:issue:`154`)
0 # -*- coding: utf-8 -*-
1 #
2 # fsspec documentation build configuration file, created by
3 # sphinx-quickstart on Mon Jan 15 18:11:02 2018.
4 #
5 # This file is execfile()d with the current directory set to its
6 # containing dir.
7 #
8 # Note that not all possible configuration values are present in this
9 # autogenerated file.
10 #
11 # All configuration values have a default; values that are commented out
12 # serve to show the default.
13
14 # If extensions (or modules to document with autodoc) are in another directory,
15 # add these directories to sys.path here. If the directory is relative to the
16 # documentation root, use os.path.abspath to make it absolute, like shown here.
17 #
18 import os
19 import sys
20
21 sys.path.insert(0, os.path.abspath("../.."))
22
23
24 # -- General configuration ------------------------------------------------
25
26 # If your documentation needs a minimal Sphinx version, state it here.
27 #
28 # needs_sphinx = '1.0'
29
30 # Add any Sphinx extension module names here, as strings. They can be
31 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 # ones.
33 extensions = [
34 "sphinx.ext.autodoc",
35 "sphinx.ext.viewcode",
36 "sphinx.ext.autosummary",
37 "sphinx.ext.extlinks",
38 "numpydoc",
39 ]
40
41 # Add any paths that contain templates here, relative to this directory.
42 templates_path = ["_templates"]
43
44 # The suffix(es) of source filenames.
45 # You can specify multiple suffix as a list of string:
46 #
47 # source_suffix = ['.rst', '.md']
48 source_suffix = ".rst"
49
50 # The master toctree document.
51 master_doc = "index"
52
53 # General information about the project.
54 project = "fsspec"
55 copyright = "2018, Martin Durant"
56 author = "Martin Durant"
57
58 # The version info for the project you're documenting, acts as replacement for
59 # |version| and |release|, also used in various other places throughout the
60 # built documents.
61 #
62 # The short X.Y version.
63 import fsspec
64
65 version = fsspec.__version__
66 # The full version, including alpha/beta/rc tags.
67 release = fsspec.__version__
68
69 # The language for content autogenerated by Sphinx. Refer to documentation
70 # for a list of supported languages.
71 #
72 # This is also used if you do content translation via gettext catalogs.
73 # Usually you set "language" from the command line for these cases.
74 language = None
75
76 # List of patterns, relative to source directory, that match files and
77 # directories to ignore when looking for source files.
78 # This patterns also effect to html_static_path and html_extra_path
79 exclude_patterns = []
80
81 # The name of the Pygments (syntax highlighting) style to use.
82 pygments_style = "sphinx"
83
84 # If true, `todo` and `todoList` produce output, else they produce nothing.
85 todo_include_todos = False
86
87
88 # -- Options for HTML output ----------------------------------------------
89
90 # The theme to use for HTML and HTML Help pages. See the documentation for
91 # a list of builtin themes.
92 #
93 html_theme = "sphinx_rtd_theme"
94
95 # Theme options are theme-specific and customize the look and feel of a theme
96 # further. For a list of options available for each theme, see the
97 # documentation.
98 #
99 # html_theme_options = {}
100
101 # Add any paths that contain custom static files (such as style sheets) here,
102 # relative to this directory. They are copied after the builtin static files,
103 # so a file named "default.css" will overwrite the builtin "default.css".
104 html_static_path = []
105
106 # Custom sidebar templates, must be a dictionary that maps document names
107 # to template names.
108 #
109 # This is required for the alabaster theme
110 # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
111 html_sidebars = {
112 "**": [
113 "relations.html", # needs 'show_related': True theme option to display
114 "searchbox.html",
115 ]
116 }
117
118
119 # -- Options for HTMLHelp output ------------------------------------------
120
121 # Output file base name for HTML help builder.
122 htmlhelp_basename = "fsspecdoc"
123
124
125 # -- Options for LaTeX output ---------------------------------------------
126
127 latex_elements = {
128 # The paper size ('letterpaper' or 'a4paper').
129 #
130 # 'papersize': 'letterpaper',
131 # The font size ('10pt', '11pt' or '12pt').
132 #
133 # 'pointsize': '10pt',
134 # Additional stuff for the LaTeX preamble.
135 #
136 # 'preamble': '',
137 # Latex figure (float) alignment
138 #
139 # 'figure_align': 'htbp',
140 }
141
142 # Grouping the document tree into LaTeX files. List of tuples
143 # (source start file, target name, title,
144 # author, documentclass [howto, manual, or own class]).
145 latex_documents = [
146 (master_doc, "fsspec.tex", "fsspec Documentation", "Joseph Crail", "manual")
147 ]
148
149
150 # -- Options for manual page output ---------------------------------------
151
152 # One entry per manual page. List of tuples
153 # (source start file, name, description, authors, manual section).
154 man_pages = [(master_doc, "fsspec", "fsspec Documentation", [author], 1)]
155
156
157 # -- Options for Texinfo output -------------------------------------------
158
159 # Grouping the document tree into Texinfo files. List of tuples
160 # (source start file, target name, title, author,
161 # dir menu entry, description, category)
162 texinfo_documents = [
163 (
164 master_doc,
165 "fsspec",
166 "fsspec Documentation",
167 author,
168 "fsspec",
169 "One line description of project.",
170 "Miscellaneous",
171 )
172 ]
173
174 extlinks = {
175 "issue": ("https://github.com/intake/filesystem_spec/issues/%s", "GH#"),
176 "pr": ("https://github.com/intake/filesystem_spec/pull/%s", "GH#"),
177 }
0 Features of fsspec
1 ==================
2
3 ``fsspec`` provides a consistent API to many different storage backends. The general API and functionality were
4 proven with the projects `s3fs`_ and `gcsfs`_ (along with `hdfs3`_ and `adlfs`_), within the
5 context of Dask and independently. These have been tried and tested by many users and have shown their
6 usefulness over some years. ``fsspec`` aims to build on these and unify their models, as well
7 as to extract file-system handling code from Dask, which does not fit so comfortably within a
8 library designed for task-graph creation and scheduling.
9
10 .. _s3fs: https://s3fs.readthedocs.io/en/latest/
11 .. _gcsfs: https://gcsfs.readthedocs.io/en/latest/
12 .. _hdfs3: https://hdfs3.readthedocs.io/en/latest/
13 .. _adlfs: https://azure-datalake-store.readthedocs.io/en/latest/
14
15 Here follows a brief description of some notable features of ``fsspec`` that promise to make
16 it an interesting project beyond some other file-system abstractions.
17
18 Serialisability
19 ---------------
20
21 Coming out of the Dask stable, it was an important design decision that file-system instances
22 be serialisable, so that they could be created in one process (e.g., the client) and used in
23 other processes (typically the workers). These other processes may even be on other machines,
24 so in many cases they would need to be able to re-establish credentials, ideally without passing
25 sensitive tokens in the pickled binary data.
26
27 ``fsspec`` instances, generally speaking, abide by these rules: they do not include locks, files and other
28 thread-local material, and, where possible, use local credentials (such as a token file)
29 for re-establishing sessions upon de-serialisation (while making use of cached instances, where
30 they exist; see below).
31
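As a minimal sketch of what this enables (assuming only the standard ``pickle`` module and the
bundled in-memory implementation), an instance survives a pickle round-trip:

.. code-block:: python

    import pickle

    import fsspec

    fs = fsspec.filesystem("memory")
    restored = pickle.loads(pickle.dumps(fs))
    # the restored object is a usable filesystem of the same class
    assert type(restored) is type(fs)
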
32 ``OpenFile`` instances
33 ----------------------
34
35 The :class:`fsspec.core.OpenFile` class provides a convenient, portable way to prescribe the manner in which to
36 open some file (local,
37 remote, in a compressed store, etc.), and can also apply any compression and
38 text-mode to the file. These instances are also serialisable, because they do not contain any open
39 files.
40
41 The way to work with ``OpenFile`` instances is to isolate interaction with them in a ``with`` context. It is
42 the initiation of the context which actually does the work of creating file-like instances.
43
44 .. code-block:: python
45
46 of = fsspec.open(url, ...)
47 # of is just a place-holder
48 with of as f:
49 # f is now a real file-like object holding resources
50 f.read(...)
51
52 Random Access and Buffering
53 ---------------------------
54
55 The :class:`fsspec.spec.AbstractBufferedFile` class is provided as an easy way to build file-like
56 interfaces to some service which is capable of providing blocks of bytes. This class is derived
57 from in a number of the existing implementations. A subclass of ``AbstractBufferedFile`` provides
58 random access for the underlying file-like data (without downloading the whole thing) and
59 configurable read-ahead buffers to minimise the number of the read operations that need to be
60 performed on the back-end storage.
61
62 This is also a critical feature in the big-data access model, where each sub-task of an operation
63 may need only a small part of a file, and does not, therefore, want to be forced into downloading the
64 whole thing.
65
66 Transparent text-mode and compression
67 -------------------------------------
68
69 As mentioned above, the ``OpenFile`` class allows for the opening of files on a binary store,
70 which appear to be in text mode and/or allow for a compression/decompression layer between the
71 caller and the back-end storage system. From the user's point of view, this is achieved simply
72 by passing arguments to the :func:`fsspec.open_files` or :func:`fsspec.open` functions, and
73 thereafter happens transparently.
74
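For example (a sketch only; the URL is a placeholder, and the set of available compressions depends
on which codec libraries are installed):

.. code-block:: python

    import fsspec

    # decompress on the fly and decode bytes to text while reading
    with fsspec.open("https://example.com/data.csv.gz", mode="rt",
                     compression="gzip", encoding="utf8") as f:
        header = f.readline()
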
75 Key-value stores
76 ----------------
77
78 File-systems are naturally dict-like key-value mappings: each (string) path corresponds to some
79 binary data on the storage back-end. For some use-cases, it is very convenient to be able to
80 view some path within the file-system as a dict-like store, and the function :func:`fsspec.get_mapper`
81 gives a one-stop way to return such an object. This has become useful, for example, in the
82 context of the `zarr`_ project, which stores its array chunks in keys in any arbitrary mapping-like
83 object.
84
85 .. code-block:: python
86
87 mapper = fsspec.get_mapper('protocol://server/path', args)
88 list(mapper)
89 mapper[k] = b'some data'
90
91 .. _zarr: https://zarr.readthedocs.io/en/stable/
92
93 PyArrow integration
94 -------------------
95
96 `pyarrow`_ has its own internal idea of what a file-system is (``pyarrow.filesystem.FileSystem``),
97 and some functions, particularly the loading of parquet, require that the target be compatible.
98 As it happens, the design of the file-system interface in ``pyarrow`` *is* compatible with ``fsspec``
99 (this is not by accident). Therefore at import time, ``fsspec`` checks for the existence of
100 ``pyarrow``, and, if found, adds it to the superclasses of the spec base-class. In this manner,
101 all ``fsspec``-derived file-systems are also pyarrow file-systems, and can be used by pyarrow
102 functions.
103
104 .. _pyarrow: https://arrow.apache.org/docs/python/
105
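A small sketch of this behaviour (assuming an older ``pyarrow`` release that still provides
``pyarrow.filesystem.FileSystem``, and that it was importable when ``fsspec`` was first imported):

.. code-block:: python

    import pyarrow
    import fsspec

    fs = fsspec.filesystem("memory")
    # fsspec-derived filesystems are then also pyarrow filesystems
    assert isinstance(fs, pyarrow.filesystem.FileSystem)
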
106 Transactions
107 ------------
108
109 ``fsspec`` supports *transactions*, during which writes to files on a remote store are deferred
110 (typically put into a temporary location) until the transaction is over, whereupon the whole
111 transaction is finalised in a semi-atomic way, and all the files are moved/committed to their
112 final destination. The implementation details are file-system specific (and not all
113 implementations support transactions yet), but the idea is
114 that all files should get written, or none, to mitigate data corruption. The feature
115 can be used like this:
116
117 .. code-block:: python
118
119 fs = fsspec.filesystem(...)
120 with fs.transaction:
121 with fs.open('file1', 'wb') as f:
122 f.write(b'some data')
123 with fs.open('file2', 'wb') as f:
124 f.write(b'more data')
125
126 Here, files 1 and 2 do not get moved to the target location until the transaction context finishes.
127 If the context finishes due to an (uncaught) exception, then the files are discarded and the
128 target file locations are left untouched.
129
130 The class :class:`fsspec.spec.Transaction` allows for fine-tuning of the operation, and every
131 ``fsspec`` instance has an instance of this as an attribute ``.transaction`` to give access.
132
133 Note that synchronising transactions across multiple instances, perhaps across a cluster,
134 is a harder problem to solve, and the implementation described here is only part of the solution.
135
136 Mount anything with FUSE
137 ------------------------
138
139 Any path of any file-system can be mapped to a local directory using pyfuse and
140 :func:`fsspec.fuse.run`. This feature is experimental, but basic file listing with
141 details, and read/write should generally be available to the extent that the
142 remote file-system provides enough information. Naturally, if a file-system is read-only,
143 then write operations will fail - but they will tend to fail late and with obscure
144 error messages such as "bad address".
145
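A sketch of how mounting might look (the argument order ``run(fs, remote_path, mount_point)`` and the
paths shown are assumptions; a system FUSE installation and an existing empty mount directory are required):

.. code-block:: python

    import fsspec
    from fsspec.fuse import run

    fs = fsspec.filesystem("memory")
    with fs.open("/data/hello.txt", "wb") as f:
        f.write(b"hello world")

    # blocks in the foreground, serving "/data/" at the local mount point
    run(fs, "/data/", "/tmp/fsspec_mount/")
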
146 Some specific quirks of some file-systems may cause confusion for FUSE. For example,
147 it is possible for a given path on s3 to be both a valid key (i.e., containing binary
148 data, like a file) and a valid prefix (i.e., can be listed to find subkeys, like a
149 directory). Since this breaks the assumptions of a normal file-system, it may not
150 be possible to reach all paths on the remote.
151
152 Instance Caching
153 ----------------
154
155 If a file-system implementation class is marked as *cachable* (attribute ``.cachable``),
156 then its instances will
157 get stored in a class attribute, to enable quick look-up instead of needing to regenerate
158 potentially expensive connections and sessions. The key in the cache is a tokenisation of
159 the arguments used to create the instance. The cache itself (attribute ``._cache``)
160 is currently a simple dict, but could in the future be an LRU cache, or something more complicated,
161 to fine-tune instance lifetimes.
162
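A quick illustration (a sketch using the bundled local filesystem, which is cachable by default):

.. code-block:: python

    import fsspec

    fs1 = fsspec.filesystem("file")
    fs2 = fsspec.filesystem("file")
    # identical arguments tokenise to the same key, so the same instance is returned
    assert fs1 is fs2
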
163 Since files can hold on to write caches and read buffers,
164 the instance cache may cause excessive memory usage in some situations; but normally, files
165 will get ``close``d, and the data discarded. Only when there is also an unfinalised transaction or a
166 captured traceback might this become a problem.
167
168 File Buffering
169 --------------
170
171 Most implementations create file objects which derive from ``fsspec.spec.AbstractBufferedFile``, and
172 have many behaviours in common. These files offer buffering of both read and write operations, so that
173 communication with the remote resource is limited. The size of the buffer is generally configured
174 with the ``blocksize=`` kwarg at open time, although the implementation may have some minimum or
175 maximum sizes that need to be respected.
176
177 For reading, a number of buffering schemes are available, listed in ``fsspec.caching.caches``
178 (see :ref:`readbuffering`), or "none" for no buffering at all. For example, for a simple read-ahead
179 buffer, you can do:
180
181 .. code-block:: python
182
183 fs = fsspec.filesystem(...)
184 with fs.open(path, mode='rb', cache_type='readahead') as f:
185 use_for_something(f)
186
187 Caching Files Locally
188 ---------------------
189
190 ``fsspec`` allows you to access data on remote file systems; that is its purpose. However, such
191 access can often be rather slow compared to local storage, so, as well as buffering (see above), the
192 option exists to copy files locally when you first access them, and thereafter to use the local data.
193 This local cache of data might be temporary (i.e., attached to the process and discarded when the
194 process ends) or at some specific location in your local storage.
195
196 Two mechanisms are provided, and both involve wrapping a `target` filesystem. The following example
197 creates a file-based cache.
198
199 .. code-block:: python
200
201 fs = fsspec.filesystem("filecache", target_protocol='s3', target_options={'anon': True},
202 cache_storage='/tmp/files/')
203
204 Each time you open a remote file on S3, it will first be copied to
205 the given local directory, and then all further access will use the local file. Since we specify
206 a particular local location, the files will persist and can be reused in future sessions, although
207 you can also set policies to have cached files expire after some time, or to check the remote file system
208 on each open, to see if the target file has changed since it was copied.
209
210 With the "blockcache" variant, data is downloaded block-wise: only the specific parts of the remote file
211 which are accessed. This means that the local copy of the file might end up being much smaller than the
212 remote one, if only certain parts of it are required.
213
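A corresponding sketch for the block-wise variant (the option names mirror the "filecache" example
above and should be treated as assumptions, as should the bucket path):

.. code-block:: python

    import fsspec

    fs = fsspec.filesystem("blockcache", target_protocol="s3",
                           target_options={"anon": True},
                           cache_storage="/tmp/blocks/")

    with fs.open("bucket/large-file.bin", mode="rb") as f:
        f.read(2 ** 20)   # only the blocks touched by this read are fetched and cached
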
214 Whereas "filecache" works for all file system implementations, and provides a real local file for other
215 libraries to use, "blockcache" has restrictions: that you have a storage/OS combination which supports
216 sparse files, that the backend implementation uses files which derive from ``AbstractBufferedFile``,
217 and that the library you pass the resultant object to accepts generic python file-like objects. You
218 should not mix block- and file-caches in the same directory.
fsspec: python filesystem interfaces
1 ======================================
2
3 Filesystem Spec is a project to unify various projects and classes that work with remote filesystems and
4 file-system-like abstractions using a standard pythonic interface.
5
6
7 .. _highlight:
8
9 Highlights
10 ----------
11
12 - based on s3fs and gcsfs
13 - ``fsspec`` instances are serializable and can be passed between processes/machines
14 - the ``OpenFiles`` file-like instances are also serializable
15 - implementations provide random access, to enable only the part of a file required to be read; plus a template
16 to base other file-like classes on
17 - file access can use transparent compression and text-mode
18 - any file-system directory can be viewed as a key-value/mapping store
19 - if pyarrow is installed, all file-system classes also subclass from ``pyarrow.filesystem.FileSystem``, so
20 can work with any arrow function expecting such an instance
21 - writes can be transactional: stored in a temporary location and only moved to the final
22 destination when the transaction is committed
23 - FUSE: mount any path from any backend to a point on your file-system
24 - cached instances tokenised on the instance parameters
25
26 These are described further in the :doc:`features` section.
27
28 Installation
29 ------------
30
31 pip install fsspec
32
33 or
34
35 conda install -c conda-forge fsspec
36
37 Implementations
38 ---------------
39
40 This repo contains several file-system implementations, see :ref:`implementations`. However,
41 the external projects ``s3fs`` and ``gcsfs`` depend on ``fsspec`` and share the same behaviours.
42 ``Dask`` and ``Intake`` use ``fsspec`` internally for their IO needs.
43
44 The current list of known implementations can be found as follows
45
46 .. code-block:: python
47
48 from fsspec.registry import known_implementations
49 known_implementations
50
51 These are only imported on request, which may fail if a required dependency is missing. The dictionary
52 ``fsspec.registry`` contains all imported implementations, and can be mutated by user code, if necessary.
53
54
55 .. toctree::
56 :maxdepth: 2
57 :caption: Contents:
58
59 intro.rst
60 usage.rst
61 features.rst
62 api.rst
63 changelog.rst
64
65
66 Indices and tables
67 ==================
68
69 * :ref:`genindex`
70 * :ref:`modindex`
71 * :ref:`search`
0 Introduction
1 ============
2
3 To get stuck into using the package, rather than reading about its philosophy and history, you can
4 skip to :doc:`usage`.
5
6 Background
7 ----------
8
9 Python provides a standard interface for open files, so that alternate implementations of file-like objects can
10 work seamlessly with many functions which rely only on the methods of that standard interface. A number of libraries
11 have implemented a similar concept for file-systems, where file operations can be performed on a logical file-system
12 which may be local, a structured data store, or some remote service.
13
14 This repository is intended to be a place to define a standard interface that such file-systems should adhere to,
15 such that code using them should not have to know the details of the implementation in order to operate on any of
16 a number of backends. The hope is that the community can come together to
17 define an interface that is best for the largest number of users, and that having the specification makes developing
18 other file-system implementations simpler.
19
20 History
21 -------
22
23 I (Martin Durant) have been involved in building a number of remote-data file-system implementations, principally
24 in the context of the `Dask`_ project. In particular, several are listed
25 in `docs`_ with links to the specific repositories.
26 With common authorship, there is much that is similar between the implementations, for example posix-like naming
27 of the operations, and this has allowed Dask to be able to interact with the various backends and parse generic
28 URLs in order to select amongst them. However, *some* extra code was required in each case to adapt the peculiarities
29 of each implementation to the generic usage that Dask demanded. People may find the
30 `code`_ which parses URLs and creates file-system
31 instances interesting.
32
33 .. _Dask: http://dask.pydata.org/en/latest/
34 .. _docs: http://dask.pydata.org/en/latest/remote-data-services.html
35 .. _code: https://github.com/dask/dask/blob/master/dask/bytes/core.py#L266
36
37 At the same time, the Apache `Arrow`_ project was also concerned with a similar problem,
38 particularly a common interface to local and HDFS files, for example the
39 `hdfs`_ interface (which actually communicated with HDFS
40 with a choice of driver). These are mostly used internally within Arrow, but Dask was modified in order to be able
41 to use the alternate HDFS interface (which solves some security issues with `hdfs3`). In the process, a
42 `conversation`_
43 was started, and I invite all interested parties to continue the conversation in this location.
44
45 .. _Arrow: https://arrow.apache.org/
46 .. _hdfs: https://arrow.apache.org/docs/python/filesystems.html
47 .. _conversation: https://github.com/dask/dask/issues/2880
48
49 There is a good argument that this type of code has no place in Dask, which is concerned with making graphs
50 representing computations, and executing those graphs on a scheduler. Indeed, the file-systems are generally useful,
51 and each has a user-base wider than just those that work via Dask.
52
53 Influences
54 ----------
55
56 The following were places to consider when choosing the definitions of how we would like the file-system specification
57 to look:
58
59 - python's `os`_ module and its `path` namespace; also other file-connected
60 functionality in the standard library
61 - posix/bash method naming conventions that linux/unix/osx users are familiar with; or perhaps their Windows variants
62 - the existing implementations for the various backends (e.g.,
63 `gcsfs`_ or Arrow's
64 `hdfs`_)
65 - `pyfilesystems`_, an attempt to do something similar, with a
66 plugin architecture. This conception has several types of local file-system, and a lot of well-thought-out
67 validation code.
68
69 .. _os: https://docs.python.org/3/library/os.html
70 .. _gcsfs: http://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem
71 .. _pyfilesystems: https://docs.pyfilesystem.org/en/latest/index.html
72
73 Not pyfilesystems?
74 ------------------
75
76 It might have been conceivable to reuse code in ``pyfilesystems``, which has an established interface and several
77 implementations of its own. However, it supports none of the :ref:`highlight`, critical to
78 cloud and parallel access, and would not be easy to
79 coerce. Following on the success of ``s3fs`` and ``gcsfs``, and their use within Dask, it seemed best to
80 have an interface as close to those as possible. See a
81 `discussion`_ on the topic.
82
83 .. _discussion: https://github.com/intake/filesystem_spec/issues/5
84
85 Structure of the package
86 ------------------------
87
88 The best place to get a feel for the contents of ``fsspec`` is by looking through the :doc:`usage` and
89 :doc:`api` sections. In addition, the source code will be interesting for those who wish to subclass and
90 develop new file-system implementations. ``fsspec/spec.py`` contains the main abstract file-system class
91 to derive from, ``AbstractFileSystem``.
92
93 .. _zarr: https://zarr.readthedocs.io
0 Usage
1 =====
2
3 This is quick-start documentation to help people get familiar with the layout and functioning of ``fsspec``.
4
5 Instantiate a file-system
6 -------------------------
7
8 ``fsspec`` provides an abstract file-system interface as a template for other filesystems. In this context,
9 "interface" means an API for working with files on the given file-system, which can mean files on some
10 remote store, local files, files within some wrapper, or anything else that is capable of producing
11 file-like objects.
12
13 Some concrete implementations are bundled with ``fsspec`` and others can be installed separately. They
14 can be instantiated directly, or the `registry` can be used to find them.
15
16 Direct instantiation:
17
18 .. code-block:: python
19
20 from fsspec.implementations.local import LocalFileSystem
21 fs = LocalFileSystem()
22
23 Look-up via registry:
24
25 .. code-block:: python
26
27 import fsspec
28 fs = fsspec.filesystem('file')
29
30 Many filesystems also take extra parameters, some of which may be optional - see :doc:`api`.
31
32 .. code-block:: python
33
34 import fsspec
35 fs = fsspec.filesystem('ftp', host=host, port=port,
36 username=user, password=pw)
37
38 Use a file-system
39 -----------------
40
41 File-system instances offer a large number of methods for getting information about and manipulating files
42 for the given back-end. Although some specific implementations may not offer all features (e.g., ``http``
43 is read-only), generally all normal operations, such as ``ls``, ``rm``, should be expected to work (see the
44 full list: :class:`fsspec.spec.AbstractFileSystem`).
45 Note that this quick-start will prefer posix-style naming, but
46 many common operations are aliased: ``cp()`` and ``copy()`` are identical, for instance.
47 Functionality is generally chosen to be as close to the builtin ``os`` module's working for things like
48 ``glob`` as possible.
49
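A short sketch of typical calls (the paths are purely illustrative, and the copied file is assumed to exist):

.. code-block:: python

    import fsspec

    fs = fsspec.filesystem("file")
    fs.makedirs("/tmp/demo", exist_ok=True)
    fs.ls("/tmp/demo", detail=False)               # names only; detail=True gives info dicts
    fs.copy("/tmp/demo/a.txt", "/tmp/demo/b.txt")  # cp() is an equivalent alias
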
50 The ``open()`` method will return a file-like object which can be passed to any other library that expects
51 to work with python files. These will normally be binary-mode only, but may implement internal buffering
52 in order to limit the number of reads from a remote source. They respect the use of ``with`` contexts. If
53 you have ``pandas`` installed, for example, you can do the following:
54
55 .. code-block:: python
56
57 with fs.open('https://raw.githubusercontent.com/dask/'
58 'fastparquet/master/test-data/nation.csv') as f:
59 df = pd.read_csv(f, sep='|', header=None)
60
61 Higher-level
62 ------------
63
64 For many situations, the only function that will be needed is :func:`fsspec.open_files()`, which will return
65 :class:`fsspec.core.OpenFile` instances created from a single URL and parameters to pass to the backend.
66 This supports text-mode and compression on the fly, and the objects can be serialized for passing between
67 processes or machines (so long as each has access to the same backend file-system). The protocol (i.e.,
68 backend) is inferred from the URL passed, and glob characters are expanded in read mode (search for files)
69 or write mode (create names). Critically, the file on the backend system is not actually opened until the
70 ``OpenFile`` instance is used in a ``with`` context. For the example above:
71
72 .. code-block:: python
73
74 of = fsspec.open('https://raw.githubusercontent.com/dask/'
75 'fastparquet/master/test-data/nation.csv', mode='r')
76 # of is a not-yet-open OpenFile object. The "with" context actually opens it
77 with of as f:
78 # now f is a text-mode file
79 df = pd.read_csv(f, sep='|', header=None)
80
0 from ._version import get_versions
1
2 from .spec import AbstractFileSystem
3 from .registry import get_filesystem_class, registry, filesystem
4 from .mapping import FSMap, get_mapper
5 from .core import open_files, get_fs_token_paths, open
6 from . import caching
7
8 __version__ = get_versions()["version"]
9 del get_versions
10
11
12 __all__ = [
13 "AbstractFileSystem",
14 "FSMap",
15 "filesystem",
16 "get_filesystem_class",
17 "get_fs_token_paths",
18 "get_mapper",
19 "open",
20 "open_files",
21 "registry",
22 "caching",
23 ]
0 # This file helps to compute a version number in source trees obtained from
1 # git-archive tarball (such as those provided by githubs download-from-tag
2 # feature). Distribution tarballs (built by setup.py sdist) and build
3 # directories (produced by setup.py build) will contain a much shorter file
4 # that just contains the computed version number.
5
6 # This file is released into the public domain. Generated by
7 # versioneer-0.18 (https://github.com/warner/python-versioneer)
8
9 """Git implementation of _version.py."""
10
11 import errno
12 import os
13 import re
14 import subprocess
15 import sys
16
17
18 def get_keywords():
19 """Get the keywords needed to look up the version information."""
20 # these strings will be replaced by git during git-archive.
21 # setup.py/versioneer.py will grep for the variable names, so they must
22 # each be defined on a line of their own. _version.py will just call
23 # get_keywords().
24 git_refnames = " (tag: 0.6.0)"
25 git_full = "8b59dc8c2c035db5793102b9513c46e6a1bd4fb0"
26 git_date = "2019-11-13 10:37:40 -0600"
27 keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
28 return keywords
29
30
31 class VersioneerConfig:
32 """Container for Versioneer configuration parameters."""
33
34
35 def get_config():
36 """Create, populate and return the VersioneerConfig() object."""
37 # these strings are filled in when 'setup.py versioneer' creates
38 # _version.py
39 cfg = VersioneerConfig()
40 cfg.VCS = "git"
41 cfg.style = "pep440"
42 cfg.tag_prefix = ""
43 cfg.parentdir_prefix = "None"
44 cfg.versionfile_source = "fsspec/_version.py"
45 cfg.verbose = False
46 return cfg
47
48
49 class NotThisMethod(Exception):
50 """Exception raised if a method is not valid for the current scenario."""
51
52
53 LONG_VERSION_PY = {}
54 HANDLERS = {}
55
56
57 def register_vcs_handler(vcs, method): # decorator
58 """Decorator to mark a method as the handler for a particular VCS."""
59
60 def decorate(f):
61 """Store f in HANDLERS[vcs][method]."""
62 if vcs not in HANDLERS:
63 HANDLERS[vcs] = {}
64 HANDLERS[vcs][method] = f
65 return f
66
67 return decorate
68
69
70 def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
71 """Call the given command(s)."""
72 assert isinstance(commands, list)
73 p = None
74 for c in commands:
75 try:
76 dispcmd = str([c] + args)
77 # remember shell=False, so use git.cmd on windows, not just git
78 p = subprocess.Popen(
79 [c] + args,
80 cwd=cwd,
81 env=env,
82 stdout=subprocess.PIPE,
83 stderr=(subprocess.PIPE if hide_stderr else None),
84 )
85 break
86 except EnvironmentError:
87 e = sys.exc_info()[1]
88 if e.errno == errno.ENOENT:
89 continue
90 if verbose:
91 print("unable to run %s" % dispcmd)
92 print(e)
93 return None, None
94 else:
95 if verbose:
96 print("unable to find command, tried %s" % (commands,))
97 return None, None
98 stdout = p.communicate()[0].strip()
99 if sys.version_info[0] >= 3:
100 stdout = stdout.decode()
101 if p.returncode != 0:
102 if verbose:
103 print("unable to run %s (error)" % dispcmd)
104 print("stdout was %s" % stdout)
105 return None, p.returncode
106 return stdout, p.returncode
107
108
109 def versions_from_parentdir(parentdir_prefix, root, verbose):
110 """Try to determine the version from the parent directory name.
111
112 Source tarballs conventionally unpack into a directory that includes both
113 the project name and a version string. We will also support searching up
114 two directory levels for an appropriately named parent directory
115 """
116 rootdirs = []
117
118 for i in range(3):
119 dirname = os.path.basename(root)
120 if dirname.startswith(parentdir_prefix):
121 return {
122 "version": dirname[len(parentdir_prefix) :],
123 "full-revisionid": None,
124 "dirty": False,
125 "error": None,
126 "date": None,
127 }
128 else:
129 rootdirs.append(root)
130 root = os.path.dirname(root) # up a level
131
132 if verbose:
133 print(
134 "Tried directories %s but none started with prefix %s"
135 % (str(rootdirs), parentdir_prefix)
136 )
137 raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
138
139
140 @register_vcs_handler("git", "get_keywords")
141 def git_get_keywords(versionfile_abs):
142 """Extract version information from the given file."""
143 # the code embedded in _version.py can just fetch the value of these
144 # keywords. When used from setup.py, we don't want to import _version.py,
145 # so we do it with a regexp instead. This function is not used from
146 # _version.py.
147 keywords = {}
148 try:
149 f = open(versionfile_abs, "r")
150 for line in f.readlines():
151 if line.strip().startswith("git_refnames ="):
152 mo = re.search(r'=\s*"(.*)"', line)
153 if mo:
154 keywords["refnames"] = mo.group(1)
155 if line.strip().startswith("git_full ="):
156 mo = re.search(r'=\s*"(.*)"', line)
157 if mo:
158 keywords["full"] = mo.group(1)
159 if line.strip().startswith("git_date ="):
160 mo = re.search(r'=\s*"(.*)"', line)
161 if mo:
162 keywords["date"] = mo.group(1)
163 f.close()
164 except EnvironmentError:
165 pass
166 return keywords
167
168
169 @register_vcs_handler("git", "keywords")
170 def git_versions_from_keywords(keywords, tag_prefix, verbose):
171 """Get version information from git keywords."""
172 if not keywords:
173 raise NotThisMethod("no keywords at all, weird")
174 date = keywords.get("date")
175 if date is not None:
176 # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
177 # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
178 # -like" string, which we must then edit to make compliant), because
179 # it's been around since git-1.5.3, and it's too difficult to
180 # discover which version we're using, or to work around using an
181 # older one.
182 date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
183 refnames = keywords["refnames"].strip()
184 if refnames.startswith("$Format"):
185 if verbose:
186 print("keywords are unexpanded, not using")
187 raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
188 refs = set([r.strip() for r in refnames.strip("()").split(",")])
189 # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
190 # just "foo-1.0". If we see a "tag: " prefix, prefer those.
191 TAG = "tag: "
192 tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
193 if not tags:
194 # Either we're using git < 1.8.3, or there really are no tags. We use
195 # a heuristic: assume all version tags have a digit. The old git %d
196 # expansion behaves like git log --decorate=short and strips out the
197 # refs/heads/ and refs/tags/ prefixes that would let us distinguish
198 # between branches and tags. By ignoring refnames without digits, we
199 # filter out many common branch names like "release" and
200 # "stabilization", as well as "HEAD" and "master".
201 tags = set([r for r in refs if re.search(r"\d", r)])
202 if verbose:
203 print("discarding '%s', no digits" % ",".join(refs - tags))
204 if verbose:
205 print("likely tags: %s" % ",".join(sorted(tags)))
206 for ref in sorted(tags):
207 # sorting will prefer e.g. "2.0" over "2.0rc1"
208 if ref.startswith(tag_prefix):
209 r = ref[len(tag_prefix) :]
210 if verbose:
211 print("picking %s" % r)
212 return {
213 "version": r,
214 "full-revisionid": keywords["full"].strip(),
215 "dirty": False,
216 "error": None,
217 "date": date,
218 }
219 # no suitable tags, so version is "0+unknown", but full hex is still there
220 if verbose:
221 print("no suitable tags, using unknown + full revision id")
222 return {
223 "version": "0+unknown",
224 "full-revisionid": keywords["full"].strip(),
225 "dirty": False,
226 "error": "no suitable tags",
227 "date": None,
228 }
229
230
231 @register_vcs_handler("git", "pieces_from_vcs")
232 def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
233 """Get version from 'git describe' in the root of the source tree.
234
235 This only gets called if the git-archive 'subst' keywords were *not*
236 expanded, and _version.py hasn't already been rewritten with a short
237 version string, meaning we're inside a checked out source tree.
238 """
239 GITS = ["git"]
240 if sys.platform == "win32":
241 GITS = ["git.cmd", "git.exe"]
242
243 out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
244 if rc != 0:
245 if verbose:
246 print("Directory %s not under git control" % root)
247 raise NotThisMethod("'git rev-parse --git-dir' returned error")
248
249 # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
250 # if there isn't one, this yields HEX[-dirty] (no NUM)
251 describe_out, rc = run_command(
252 GITS,
253 [
254 "describe",
255 "--tags",
256 "--dirty",
257 "--always",
258 "--long",
259 "--match",
260 "%s*" % tag_prefix,
261 ],
262 cwd=root,
263 )
264 # --long was added in git-1.5.5
265 if describe_out is None:
266 raise NotThisMethod("'git describe' failed")
267 describe_out = describe_out.strip()
268 full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
269 if full_out is None:
270 raise NotThisMethod("'git rev-parse' failed")
271 full_out = full_out.strip()
272
273 pieces = {}
274 pieces["long"] = full_out
275 pieces["short"] = full_out[:7] # maybe improved later
276 pieces["error"] = None
277
278 # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
279 # TAG might have hyphens.
280 git_describe = describe_out
281
282 # look for -dirty suffix
283 dirty = git_describe.endswith("-dirty")
284 pieces["dirty"] = dirty
285 if dirty:
286 git_describe = git_describe[: git_describe.rindex("-dirty")]
287
288 # now we have TAG-NUM-gHEX or HEX
289
290 if "-" in git_describe:
291 # TAG-NUM-gHEX
292 mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
293 if not mo:
294 # unparseable. Maybe git-describe is misbehaving?
295 pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
296 return pieces
297
298 # tag
299 full_tag = mo.group(1)
300 if not full_tag.startswith(tag_prefix):
301 if verbose:
302 fmt = "tag '%s' doesn't start with prefix '%s'"
303 print(fmt % (full_tag, tag_prefix))
304 pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
305 full_tag,
306 tag_prefix,
307 )
308 return pieces
309 pieces["closest-tag"] = full_tag[len(tag_prefix) :]
310
311 # distance: number of commits since tag
312 pieces["distance"] = int(mo.group(2))
313
314 # commit: short hex revision ID
315 pieces["short"] = mo.group(3)
316
317 else:
318 # HEX: no tags
319 pieces["closest-tag"] = None
320 count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
321 pieces["distance"] = int(count_out) # total number of commits
322
323 # commit date: see ISO-8601 comment in git_versions_from_keywords()
324 date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[
325 0
326 ].strip()
327 pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
328
329 return pieces
330
331
332 def plus_or_dot(pieces):
333 """Return a + if we don't already have one, else return a ."""
334 if "+" in pieces.get("closest-tag", ""):
335 return "."
336 return "+"
337
338
339 def render_pep440(pieces):
340 """Build up version string, with post-release "local version identifier".
341
342 Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
343 get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
344
345 Exceptions:
346 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
347 """
348 if pieces["closest-tag"]:
349 rendered = pieces["closest-tag"]
350 if pieces["distance"] or pieces["dirty"]:
351 rendered += plus_or_dot(pieces)
352 rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
353 if pieces["dirty"]:
354 rendered += ".dirty"
355 else:
356 # exception #1
357 rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
358 if pieces["dirty"]:
359 rendered += ".dirty"
360 return rendered
361
362
363 def render_pep440_pre(pieces):
364 """TAG[.post.devDISTANCE] -- No -dirty.
365
366 Exceptions:
367 1: no tags. 0.post.devDISTANCE
368 """
369 if pieces["closest-tag"]:
370 rendered = pieces["closest-tag"]
371 if pieces["distance"]:
372 rendered += ".post.dev%d" % pieces["distance"]
373 else:
374 # exception #1
375 rendered = "0.post.dev%d" % pieces["distance"]
376 return rendered
377
378
379 def render_pep440_post(pieces):
380 """TAG[.postDISTANCE[.dev0]+gHEX] .
381
382 The ".dev0" means dirty. Note that .dev0 sorts backwards
383 (a dirty tree will appear "older" than the corresponding clean one),
384 but you shouldn't be releasing software with -dirty anyways.
385
386 Exceptions:
387 1: no tags. 0.postDISTANCE[.dev0]
388 """
389 if pieces["closest-tag"]:
390 rendered = pieces["closest-tag"]
391 if pieces["distance"] or pieces["dirty"]:
392 rendered += ".post%d" % pieces["distance"]
393 if pieces["dirty"]:
394 rendered += ".dev0"
395 rendered += plus_or_dot(pieces)
396 rendered += "g%s" % pieces["short"]
397 else:
398 # exception #1
399 rendered = "0.post%d" % pieces["distance"]
400 if pieces["dirty"]:
401 rendered += ".dev0"
402 rendered += "+g%s" % pieces["short"]
403 return rendered
404
405
406 def render_pep440_old(pieces):
407 """TAG[.postDISTANCE[.dev0]] .
408
409 The ".dev0" means dirty.
410
411 Exceptions:
412 1: no tags. 0.postDISTANCE[.dev0]
413 """
414 if pieces["closest-tag"]:
415 rendered = pieces["closest-tag"]
416 if pieces["distance"] or pieces["dirty"]:
417 rendered += ".post%d" % pieces["distance"]
418 if pieces["dirty"]:
419 rendered += ".dev0"
420 else:
421 # exception #1
422 rendered = "0.post%d" % pieces["distance"]
423 if pieces["dirty"]:
424 rendered += ".dev0"
425 return rendered
426
427
428 def render_git_describe(pieces):
429 """TAG[-DISTANCE-gHEX][-dirty].
430
431 Like 'git describe --tags --dirty --always'.
432
433 Exceptions:
434 1: no tags. HEX[-dirty] (note: no 'g' prefix)
435 """
436 if pieces["closest-tag"]:
437 rendered = pieces["closest-tag"]
438 if pieces["distance"]:
439 rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
440 else:
441 # exception #1
442 rendered = pieces["short"]
443 if pieces["dirty"]:
444 rendered += "-dirty"
445 return rendered
446
447
448 def render_git_describe_long(pieces):
449 """TAG-DISTANCE-gHEX[-dirty].
450
451 Like 'git describe --tags --dirty --always --long'.
452 The distance/hash is unconditional.
453
454 Exceptions:
455 1: no tags. HEX[-dirty] (note: no 'g' prefix)
456 """
457 if pieces["closest-tag"]:
458 rendered = pieces["closest-tag"]
459 rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
460 else:
461 # exception #1
462 rendered = pieces["short"]
463 if pieces["dirty"]:
464 rendered += "-dirty"
465 return rendered
466
467
468 def render(pieces, style):
469 """Render the given version pieces into the requested style."""
470 if pieces["error"]:
471 return {
472 "version": "unknown",
473 "full-revisionid": pieces.get("long"),
474 "dirty": None,
475 "error": pieces["error"],
476 "date": None,
477 }
478
479 if not style or style == "default":
480 style = "pep440" # the default
481
482 if style == "pep440":
483 rendered = render_pep440(pieces)
484 elif style == "pep440-pre":
485 rendered = render_pep440_pre(pieces)
486 elif style == "pep440-post":
487 rendered = render_pep440_post(pieces)
488 elif style == "pep440-old":
489 rendered = render_pep440_old(pieces)
490 elif style == "git-describe":
491 rendered = render_git_describe(pieces)
492 elif style == "git-describe-long":
493 rendered = render_git_describe_long(pieces)
494 else:
495 raise ValueError("unknown style '%s'" % style)
496
497 return {
498 "version": rendered,
499 "full-revisionid": pieces["long"],
500 "dirty": pieces["dirty"],
501 "error": None,
502 "date": pieces.get("date"),
503 }
504
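# Illustrative sketch: how a hypothetical `pieces` dict, as produced by
# git_pieces_from_vcs() above, renders under a few of the styles. The
# tag/hash/date values below are invented purely for demonstration.
#
#     pieces = {"closest-tag": "0.6.0", "distance": 3, "short": "abc1234",
#               "long": "abc1234def5678", "dirty": False,
#               "error": None, "date": "2019-10-01T12:00:00"}
#     render(pieces, "pep440")["version"]        # "0.6.0+3.gabc1234"
#     render(pieces, "pep440-post")["version"]   # "0.6.0.post3+gabc1234"
#     render(pieces, "git-describe")["version"]  # "0.6.0-3-gabc1234"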
505
506 def get_versions():
507 """Get version information or return default if unable to do so."""
508 # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
509 # __file__, we can work backwards from there to the root. Some
510 # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
511 # case we can only use expanded keywords.
512
513 cfg = get_config()
514 verbose = cfg.verbose
515
516 try:
517 return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose)
518 except NotThisMethod:
519 pass
520
521 try:
522 root = os.path.realpath(__file__)
523 # versionfile_source is the relative path from the top of the source
524 # tree (where the .git directory might live) to this file. Invert
525 # this to find the root from __file__.
526 for i in cfg.versionfile_source.split("/"):
527 root = os.path.dirname(root)
528 except NameError:
529 return {
530 "version": "0+unknown",
531 "full-revisionid": None,
532 "dirty": None,
533 "error": "unable to find root of source tree",
534 "date": None,
535 }
536
537 try:
538 pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
539 return render(pieces, cfg.style)
540 except NotThisMethod:
541 pass
542
543 try:
544 if cfg.parentdir_prefix:
545 return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
546 except NotThisMethod:
547 pass
548
549 return {
550 "version": "0+unknown",
551 "full-revisionid": None,
552 "dirty": None,
553 "error": "unable to compute version",
554 "date": None,
555 }
0 import os
1 import io
2 import functools
3 import logging
4 import math
5
6 logger = logging.getLogger("fsspec")
7
8
9 class BaseCache(object):
10 """Pass-though cache: doesn't keep anything, calls every time
11
12 Acts as base class for other cachers
13
14 Parameters
15 ----------
16 blocksize: int
17 How far to read ahead in numbers of bytes
18 fetcher: func
19 Function of the form f(start, end) which gets bytes from remote as
20 specified
21 size: int
22 How big this file is
23 """
24
25 def __init__(self, blocksize, fetcher, size):
26 self.blocksize = blocksize
27 self.fetcher = fetcher
28 self.size = size
29
30 def _fetch(self, start, end):
31 return self.fetcher(start, end)
32
33 def __getitem__(self, item: slice):
34 if not isinstance(item, slice):
35 raise TypeError(
36 "Cache indices must be a contiguous slice. Got {} instead.".format(
37 type(item)
38 )
39 )
40 if item.step and item.step != 1:
41 raise ValueError(
42 "Cache indices must be a contiguous slice. 'item' has step={}".format(
43 item.step
44 )
45 )
46
47 # handle endpoints
48 if item.start is None:
49 item = slice(0, item.stop)
50 elif item.start < 0:
51 item = slice(self.size + item.start, item.stop)
52 if item.stop is None:
53 item = slice(item.start, self.size)
54 elif item.stop < 0:
55 item = slice(item.start, self.size + item.stop)
56
57 return self._fetch(item.start, item.stop)
58
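# Minimal usage sketch: any callable of the form f(start, end) -> bytes can
# serve as the fetcher; here a local bytes object stands in for remote data.
#
#     data = bytes(range(256))
#     cache = BaseCache(blocksize=32, fetcher=lambda s, e: data[s:e], size=len(data))
#     cache[10:20]   # equals data[10:20]; every access calls the fetcher again
#     cache[-16:]    # negative indices are resolved against `size`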
59
60 class MMapCache(BaseCache):
61 """memory-mapped sparse file cache
62
63 Opens temporary file, which is filled blocks-wise when data is requested.
64 Ensure there is enough disc space in the temporary location.
65
66 This cache method might only work on posix
67 """
68
69 def __init__(self, blocksize, fetcher, size, location=None, blocks=None):
70 super().__init__(blocksize, fetcher, size)
71 self.blocks = set() if blocks is None else blocks
72 self.location = location
73 self.cache = self._makefile()
74
75 def _makefile(self):
76 import tempfile
77 import mmap
78
79 if self.size == 0:
80 return bytearray()
81
82 # posix version
83 if self.location is None or not os.path.exists(self.location):
84 if self.location is None:
85 fd = tempfile.TemporaryFile()
86 self.blocks = set()
87 else:
88 fd = io.open(self.location, "wb+")
89 fd.seek(self.size - 1)
90 fd.write(b"1")
91 fd.flush()
92 else:
93 fd = io.open(self.location, "rb+")
94
95 return mmap.mmap(fd.fileno(), self.size)
96
97 def _fetch(self, start, end):
98 start_block = start // self.blocksize
99 end_block = end // self.blocksize
100 need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
101 while need:
102 # TODO: not a for loop so we can consolidate blocks later to
103 # make fewer fetch calls; this could be parallel
104 i = need.pop(0)
105 sstart = i * self.blocksize
106 send = min(sstart + self.blocksize, self.size)
107 self.cache[sstart:send] = self.fetcher(sstart, send)
108 self.blocks.add(i)
109
110 return self.cache[start:end]
111
112 def __getstate__(self):
113 state = self.__dict__.copy()
114 # Remove the unpicklable entries.
115 del state["cache"]
116 return state
117
118 def __setstate__(self, state):
119 # Restore instance attributes
120 self.__dict__.update(state)
121 self.cache = self._makefile()
122
123
124 class ReadAheadCache(BaseCache):
125 """ Cache which reads only when we get beyond a block of data
126
127 This is a much simpler version of BytesCache, and does not attempt to
128 fill holes in the cache or keep fragments alive. It is best suited to
129 many small reads in a sequential order (e.g., reading lines from a file).
130 """
131
132 def __init__(self, blocksize, fetcher, size):
133 super().__init__(blocksize, fetcher, size)
134 self.cache = b""
135 self.start = 0
136 self.end = 0
137
138 def _fetch(self, start, end):
139 end = min(self.size, end)
140 l = end - start
141 if start >= self.size:
142 return b""
143 elif start >= self.start and end <= self.end:
144 # cache hit
145 return self.cache[start - self.start : end - self.start]
146 elif self.start <= start < self.end:
147 # partial hit
148 part = self.cache[start - self.start :]
149 l -= len(part)
150 start = self.end
151 else:
152 # miss
153 part = b""
154 end = min(self.size, end + self.blocksize)
155 self.cache = self.fetcher(start, end) # new block replaces old
156 self.start = start
157 self.end = self.start + len(self.cache)
158 return part + self.cache[:l]
159
160
161 class BlockCache(BaseCache):
162 """
163 Cache holding memory as a set of blocks.
164
165 Requests are only ever made `blocksize` at a time, and are
166 stored in an LRU cache. The least recently accessed block is
167 discarded when more than `maxblocks` are stored.
168
169 Parameters
170 ----------
171 blocksize : int
172 The number of bytes to store in each block.
173 Requests are only ever made for `blocksize`, so this
174 should balance the overhead of making a request against
175 the granularity of the blocks.
176 fetcher : Callable
177 size : int
178 The total size of the file being cached.
179 maxblocks : int
180 The maximum number of blocks to cache. The maximum memory
181 use for this cache is then ``blocksize * maxblocks``.
182 """
183
184 def __init__(self, blocksize, fetcher, size, maxblocks=32):
185 super().__init__(blocksize, fetcher, size)
186 self.nblocks = math.ceil(size / blocksize)
187 self.maxblocks = maxblocks
188 self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)
189
190 def __repr__(self):
191 return "<BlockCache blocksize={}, size={}, nblocks={}>".format(
192 self.blocksize, self.size, self.nblocks
193 )
194
195 def cache_info(self):
196 """
197 The statistics on the block cache.
198
199 Returns
200 -------
201 NamedTuple
202 Returned directly from the LRU Cache used internally.
203 """
204 return self._fetch_block_cached.cache_info()
205
206 def __getstate__(self):
207 state = self.__dict__.copy()
208 del state["_fetch_block_cached"]
209 return state
210
211 def __setstate__(self, state):
212 self.__dict__.update(state)
213 self._fetch_block_cached = functools.lru_cache(state["maxblocks"])(
214 self._fetch_block
215 )
216
217 def _fetch(self, start, end):
218 if end < start:
219 raise ValueError(
220 "'end' ({}) is smaller than 'start' ({}).".format(end, start)
221 )
222
223 if end > self.size:
224 raise ValueError("'end={}' larger than size ('{}')".format(end, self.size))
225
226 # byte position -> block numbers
227 start_block_number = start // self.blocksize
228 end_block_number = end // self.blocksize
229
230 # these are cached, so safe to do multiple calls for the same start and end.
231 for block_number in range(start_block_number, end_block_number + 1):
232 self._fetch_block_cached(block_number)
233
234 return self._read_cache(
235 start,
236 end,
237 start_block_number=start_block_number,
238 end_block_number=end_block_number,
239 )
240
241 def _fetch_block(self, block_number):
242 """
243 Fetch the block of data for `block_number`.
244 """
245 if block_number > self.nblocks:
246 raise ValueError(
247 "'block_number={}' is greater than the number of blocks ({})".format(
248 block_number, self.nblocks
249 )
250 )
251
252 start = block_number * self.blocksize
253 end = start + self.blocksize
254 logger.info("BlockCache fetching block %d", block_number)
255 block_contents = super()._fetch(start, end)
256 return block_contents
257
258 def _read_cache(self, start, end, start_block_number, end_block_number):
259 """
260 Read from our block cache.
261
262 Parameters
263 ----------
264 start, end : int
265 The start and end byte positions.
266 start_block_number, end_block_number : int
267 The start and end block numbers.
268 """
269 start_pos = start % self.blocksize
270 end_pos = end % self.blocksize
271
272 if start_block_number == end_block_number:
273 block = self._fetch_block_cached(start_block_number)
274 return block[start_pos:end_pos]
275
276 else:
277 # read from the initial
278 out = []
279 out.append(self._fetch_block_cached(start_block_number)[start_pos:])
280
281 # intermediate blocks
282 # Note: it'd be nice to combine these into one big request. However
283 # that doesn't play nicely with our LRU cache.
284 for block_number in range(start_block_number + 1, end_block_number):
285 out.append(self._fetch_block_cached(block_number))
286
287 # final block
288 out.append(self._fetch_block_cached(end_block_number)[:end_pos])
289
290 return b"".join(out)
291
292
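# Usage sketch for BlockCache (an in-memory source stands in for a remote
# file here): requests are rounded to whole blocks and served via the LRU.
#
#     data = b"x" * 10000
#     bc = BlockCache(blocksize=1024, fetcher=lambda s, e: data[s:e],
#                     size=len(data), maxblocks=4)
#     bc[0:10]         # fetches block 0, returns the first ten bytes
#     bc[0:10]         # same block served from the LRU, no new fetch
#     bc.cache_info()  # hit/miss counters from functools.lru_cache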
293 class BytesCache(BaseCache):
294 """Cache which holds data in a in-memory bytes object
295
296 Implements read-ahead by the block size, for semi-random reads progressing
297 through the file.
298
299 Parameters
300 ----------
301 trim: bool
302 As we read more data, whether to discard the start of the buffer when
303 we are more than a blocksize ahead of it.
304 """
305
306 def __init__(self, blocksize, fetcher, size, trim=True):
307 super().__init__(blocksize, fetcher, size)
308 self.cache = b""
309 self.start = None
310 self.end = None
311 self.trim = trim
312
313 def _fetch(self, start, end):
314 # TODO: only set start/end after fetch, in case it fails?
315 # is this where retry logic might go?
316 if (
317 self.start is not None
318 and start >= self.start
319 and self.end is not None
320 and end < self.end
321 ):
322 # cache hit: we have all the required data
323 offset = start - self.start
324 return self.cache[offset : offset + end - start]
325
326 if self.blocksize:
327 bend = min(self.size, end + self.blocksize)
328 else:
329 bend = end
330
331 if bend == start or start > self.size:
332 return b""
333
334 if (self.start is None or start < self.start) and (
335 self.end is None or end > self.end
336 ):
337 # First read, or extending both before and after
338 self.cache = self.fetcher(start, bend)
339 self.start = start
340 elif start < self.start:
341 if self.end - end > self.blocksize:
342 self.cache = self.fetcher(start, bend)
343 self.start = start
344 else:
345 new = self.fetcher(start, self.start)
346 self.start = start
347 self.cache = new + self.cache
348 elif bend > self.end:
349 if self.end > self.size:
350 pass
351 elif end - self.end > self.blocksize:
352 self.cache = self.fetcher(start, bend)
353 self.start = start
354 else:
355 new = self.fetcher(self.end, bend)
356 self.cache = self.cache + new
357
358 self.end = self.start + len(self.cache)
359 offset = start - self.start
360 out = self.cache[offset : offset + end - start]
361 if self.trim:
362 num = (self.end - self.start) // (self.blocksize + 1)
363 if num > 1:
364 self.start += self.blocksize * num
365 self.cache = self.cache[self.blocksize * num :]
366 return out
367
368 def __len__(self):
369 return len(self.cache)
370
371
372 caches = {
373 "none": BaseCache,
374 "mmap": MMapCache,
375 "bytes": BytesCache,
376 "readahead": ReadAheadCache,
377 "block": BlockCache,
378 }
0 """Helper functions for a standard streaming compression API"""
1 from bz2 import BZ2File
2 from gzip import GzipFile
3 from zipfile import ZipFile
4
5 import fsspec.utils
6 from fsspec.spec import AbstractBufferedFile
7
8
9 def noop_file(file, mode, **kwargs):
10 return file
11
12
13 # should be functions of the form func(infile, mode=, **kwargs) -> file-like
14 compr = {None: noop_file}
15
16
17 def register_compression(name, callback, extensions, force=False):
18 """Register an "inferable" file compression type.
19
20 Registers transparent file compression type for use with fsspec.open.
21 Compression can be specified by name in open, or "infer"-ed for any files
22 ending with the given extensions.
23
24 Args:
25 name: (str) The compression type name. Eg. "gzip".
26 callback: A callable of form (infile, mode, **kwargs) -> file-like.
27 Accepts an input file-like object, the target mode and kwargs.
28 Returns a wrapped file-like object.
29 extensions: (str, Iterable[str]) A file extension, or list of file
30 extensions for which to infer this compression scheme. Eg. "gz".
31 force: (bool) Force re-registration of compression type or extensions.
32
33 Raises:
34 ValueError: If name or extensions already registered, and not force.
35
36 """
37 if isinstance(extensions, str):
38 extensions = [extensions]
39
40 # Validate registration
41 if name in compr and not force:
42 raise ValueError("Duplicate compression registration: %s" % name)
43
44 for ext in extensions:
45 if ext in fsspec.utils.compressions and not force:
46 raise ValueError(
47 "Duplicate compression file extension: %s (%s)" % (ext, name)
48 )
49
50 compr[name] = callback
51
52 for ext in extensions:
53 fsspec.utils.compressions[ext] = name
54
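# Hedged sketch: registering a pass-through codec for a made-up ".raw"
# extension (the name and extension here are purely illustrative).
#
#     register_compression("identity", lambda f, mode="rb", **kw: f, "raw")
#     # fsspec.open("file.raw", compression="infer") would now resolve to it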
55
56 def unzip(infile, mode="rb", filename=None, **kwargs):
57 if "r" not in mode:
58 filename = filename or "file"
59 z = ZipFile(infile, mode="w", **kwargs)
60 fo = z.open(filename, mode="w")
61 fo.close = lambda closer=fo.close: closer() or z.close()
62 return fo
63 z = ZipFile(infile)
64 if filename is None:
65 filename = z.namelist()[0]
66 return z.open(filename, mode="r", **kwargs)
67
68
69 register_compression("zip", unzip, "zip")
70 register_compression("bz2", BZ2File, "bz2")
71 register_compression("gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz")
72
73 try:
74 import lzma
75
76 register_compression("lzma", lzma.LZMAFile, "xz")
77 register_compression("xz", lzma.LZMAFile, "xz", force=True)
78 except ImportError:
79 pass
80
81 try:
82 import lzmaffi
83
84 register_compression("lzma", lzmaffi.LZMAFile, "xz", force=True)
85 register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
86 except ImportError:
87 pass
88
89
90 class SnappyFile(AbstractBufferedFile):
91 def __init__(self, infile, mode, **kwargs):
92 import snappy
93
94 self.details = {"size": 999999999} # not true, but OK if we don't seek
95 super().__init__(fs=None, path="snappy", mode=mode.strip("b") + "b", **kwargs)
96 self.infile = infile
97 if "r" in mode:
98 self.codec = snappy.StreamDecompressor()
99 else:
100 self.codec = snappy.StreamCompressor()
101
102 def _upload_chunk(self, final=False):
103 self.buffer.seek(0)
104 out = self.codec.add_chunk(self.buffer.read())
105 self.infile.write(out)
106 return True
107
108 def seek(self, loc, whence=0):
109 raise NotImplementedError("SnappyFile is not seekable")
110
111 def seekable(self):
112 return False
113
114 def _fetch_range(self, start, end):
115 """Get the specified set of bytes from remote"""
116 data = self.infile.read(end - start)
117 return self.codec.decompress(data)
118
119
120 try:
121 import snappy
122
123 snappy.compress
124 # Snappy may use the .sz file extension, but this is not part of the
125 # standard implementation.
126 register_compression("snappy", SnappyFile, [])
127
128 except (ImportError, NameError):
129 pass
130
131 try:
132 import lz4.frame
133
134 register_compression("lz4", lz4.frame.open, "lz4")
135 except ImportError:
136 pass
137
138 try:
139 import zstandard as zstd
140
141 def zstandard_file(infile, mode="rb"):
142 if "r" in mode:
143 cctx = zstd.ZstdDecompressor()
144 return cctx.stream_reader(infile)
145 else:
146 cctx = zstd.ZstdCompressor(level=10)
147 return cctx.stream_writer(infile)
148
149 register_compression("zstd", zstandard_file, "zst")
150 except ImportError:
151 pass
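# Usage sketch (assumes a gzip-compressed "data.csv.gz" is reachable): the
# registry above lets fsspec.open pick the codec from the file extension.
#
#     import fsspec
#     with fsspec.open("data.csv.gz", "rt", compression="infer") as f:
#         header = f.readline()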
0 import os
1 import shutil
2 import subprocess
3 import sys
4 import time
5
6 import pytest
7
8 import fsspec
9 from fsspec.implementations.cached import CachingFileSystem
10
11
12 @pytest.fixture()
13 def m():
14 """
15 Fixture providing a memory filesystem.
16 """
17 m = fsspec.filesystem("memory")
18 m.store.clear()
19 try:
20 yield m
21 finally:
22 m.store.clear()
23
24
25 @pytest.fixture
26 def ftp_writable(tmpdir):
27 """
28 Fixture providing a writable FTP filesystem.
29 """
30 pytest.importorskip("pyftpdlib")
31 from fsspec.implementations.ftp import FTPFileSystem
32
33 FTPFileSystem.clear_instance_cache() # remove lingering connections
34 CachingFileSystem.clear_instance_cache()
35 d = str(tmpdir)
36 with open(os.path.join(d, "out"), "wb") as f:
37 f.write(b"hello" * 10000)
38 P = subprocess.Popen(
39 [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"]
40 )
41 try:
42 time.sleep(1)
43 yield "localhost", 2121, "user", "pass"
44 finally:
45 P.terminate()
46 P.wait()
47 try:
48 shutil.rmtree(tmpdir)
49 except Exception:
50 pass
0 from __future__ import print_function, division, absolute_import
1
2 import io
3 import os
4 import logging
5 from .compression import compr
6 from .utils import (
7 infer_compression,
8 build_name_function,
9 update_storage_options,
10 stringify_path,
11 )
12 from .registry import get_filesystem_class
13
14 # for backwards compat, we export cache things from here too
15 from .caching import ( # noqa: F401
16 BaseCache,
17 MMapCache,
18 ReadAheadCache,
19 BytesCache,
20 BlockCache,
21 caches,
22 )
23
24 logger = logging.getLogger("fsspec")
25
26
27 class OpenFile(object):
28 """
29 File-like object to be used in a context
30
31 Can layer (buffered) text-mode and compression over any file-system, which
32 is typically binary-only.
33
34 These instances are safe to serialize, as the low-level file object
35 is not created until invoked using `with`.
36
37 Parameters
38 ----------
39 fs: FileSystem
40 The file system to use for opening the file. Should match the interface
41 of ``dask.bytes.local.LocalFileSystem``.
42 path: str
43 Location to open
44 mode: str like 'rb', optional
45 Mode of the opened file
46 compression: str or None, optional
47 Compression to apply
48 encoding: str or None, optional
49 The encoding to use if opened in text mode.
50 errors: str or None, optional
51 How to handle encoding errors if opened in text mode.
52 newline: None or str
53 Passed to TextIOWrapper in text mode, how to handle line endings.
54 """
55
56 def __init__(
57 self,
58 fs,
59 path,
60 mode="rb",
61 compression=None,
62 encoding=None,
63 errors=None,
64 newline=None,
65 ):
66 self.fs = fs
67 self.path = path
68 self.mode = mode
69 self.compression = get_compression(path, compression)
70 self.encoding = encoding
71 self.errors = errors
72 self.newline = newline
73 self.fobjects = []
74
75 def __reduce__(self):
76 return (
77 OpenFile,
78 (
79 self.fs,
80 self.path,
81 self.mode,
82 self.compression,
83 self.encoding,
84 self.errors,
85 ),
86 )
87
88 def __repr__(self):
89 return "<OpenFile '{}'>".format(self.path)
90
91 def __fspath__(self):
92 return self.path
93
94 def __enter__(self):
95 mode = self.mode.replace("t", "").replace("b", "") + "b"
96
97 f = self.fs.open(self.path, mode=mode)
98
99 self.fobjects = [f]
100
101 if self.compression is not None:
102 compress = compr[self.compression]
103 f = compress(f, mode=mode[0])
104 self.fobjects.append(f)
105
106 if "b" not in self.mode:
107 # assume, for example, that 'r' is equivalent to 'rt' as in builtin
108 f = io.TextIOWrapper(
109 f, encoding=self.encoding, errors=self.errors, newline=self.newline
110 )
111 self.fobjects.append(f)
112
113 return self.fobjects[-1]
114
115 def __exit__(self, *args):
116 self.close()
117
118 def __del__(self):
119 self.close()
120
121 def open(self):
122 """Materialise this as a real open file without context
123
124 The file should be explicitly closed to avoid enclosed open file
125 instances persisting
126 """
127 return self.__enter__()
128
129 def close(self):
130 """Close all encapsulated file objects"""
131 for f in reversed(self.fobjects):
132 if "r" not in self.mode and not f.closed:
133 f.flush()
134 f.close()
135 self.fobjects = []
136
137
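# Minimal sketch of using OpenFile directly with the local filesystem
# (assumes "example.txt" exists on disk); normally instances come from
# open_files()/open() below rather than being constructed by hand.
#
#     from fsspec.implementations.local import LocalFileSystem
#     of = OpenFile(LocalFileSystem(), "example.txt", mode="rt")
#     with of as f:
#         text = f.read()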
138 def open_files(
139 urlpath,
140 mode="rb",
141 compression=None,
142 encoding="utf8",
143 errors=None,
144 name_function=None,
145 num=1,
146 protocol=None,
147 newline=None,
148 **kwargs
149 ):
150 """ Given a path or paths, return a list of ``OpenFile`` objects.
151
152 For writing, a str path must contain the "*" character, which will be filled
153 in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2.
154
155 For either reading or writing, can instead provide explicit list of paths.
156
157 Parameters
158 ----------
159 urlpath: string or list
160 Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
161 to read from alternative filesystems. To read from multiple files you
162 can pass a globstring or a list of paths, with the caveat that they
163 must all have the same protocol.
164 mode: 'rb', 'wt', etc.
165 compression: string
166 Compression to use. See ``fsspec.compression.compr`` for options.
167 encoding: str
168 For text mode only
169 errors: None or str
170 Passed to TextIOWrapper in text mode
171 name_function: function or None
172 if opening a set of files for writing, those files do not yet exist,
173 so we need to generate their names by formatting the urlpath for
174 each sequence number
175 num: int [1]
176 if writing mode, number of files we expect to create (passed to
177 name_function)
178 protocol: str or None
179 If given, overrides the protocol found in the URL.
180 newline: str or None
181 Used for line terminator in text mode. If None, uses system default;
182 if blank, uses no translation.
183 **kwargs: dict
184 Extra options that make sense to a particular storage connection, e.g.
185 host, port, username, password, etc.
186
187 Examples
188 --------
189 >>> files = open_files('2015-*-*.csv') # doctest: +SKIP
190 >>> files = open_files(
191 ... 's3://bucket/2015-*-*.csv.gz', compression='gzip'
192 ... ) # doctest: +SKIP
193
194 Returns
195 -------
196 List of ``OpenFile`` objects.
197 """
198 fs, fs_token, paths = get_fs_token_paths(
199 urlpath,
200 mode,
201 num=num,
202 name_function=name_function,
203 storage_options=kwargs,
204 protocol=protocol,
205 )
206 return [
207 OpenFile(
208 fs,
209 path,
210 mode=mode,
211 compression=compression,
212 encoding=encoding,
213 errors=errors,
214 newline=newline,
215 )
216 for path in paths
217 ]
218
219
220 def open(
221 urlpath,
222 mode="rb",
223 compression=None,
224 encoding="utf8",
225 errors=None,
226 protocol=None,
227 newline=None,
228 **kwargs
229 ):
230 """ Given a path or paths, return one ``OpenFile`` object.
231
232 Parameters
233 ----------
234 urlpath: string or list
235 Absolute or relative filepath. Prefix with a protocol like ``s3://``
236 to read from alternative filesystems. Should not include glob
237 character(s).
238 mode: 'rb', 'wt', etc.
239 compression: string
240 Compression to use. See ``fsspec.compression.compr`` for options.
241 encoding: str
242 For text mode only
243 errors: None or str
244 Passed to TextIOWrapper in text mode
245 protocol: str or None
246 If given, overrides the protocol found in the URL.
247 newline: str or None
248 Used for line terminator in text mode. If None, uses system default;
249 if blank, uses no translation.
250 **kwargs: dict
251 Extra options that make sense to a particular storage connection, e.g.
252 host, port, username, password, etc.
253
254 Examples
255 --------
256 >>> openfile = open('2015-01-01.csv') # doctest: +SKIP
257 >>> openfile = open(
258 ... 's3://bucket/2015-01-01.csv.gz',
259 ... compression='gzip'
260 ... ) # doctest: +SKIP
261 >>> with openfile as f:
262 ... df = pd.read_csv(f) # doctest: +SKIP
263
264 Returns
265 -------
266 ``OpenFile`` object.
267 """
268 return open_files(
269 [urlpath],
270 mode,
271 compression,
272 encoding,
273 errors,
274 protocol,
275 newline=newline,
276 **kwargs
277 )[0]
278
279
280 def get_compression(urlpath, compression):
281 if compression == "infer":
282 compression = infer_compression(urlpath)
283 if compression is not None and compression not in compr:
284 raise ValueError("Compression type %s not supported" % compression)
285 return compression
286
287
288 def split_protocol(urlpath):
289 """Return protocol, path pair"""
290 urlpath = stringify_path(urlpath)
291 if "://" in urlpath:
292 protocol, path = urlpath.split("://", 1)
293 if len(protocol) > 1:
294 # excludes Windows paths
295 return protocol, path
296 return None, urlpath
297
298
299 def strip_protocol(urlpath):
300 """Return only path part of full URL, according to appropriate backend"""
301 protocol, _ = split_protocol(urlpath)
302 cls = get_filesystem_class(protocol)
303 return cls._strip_protocol(urlpath)
304
305
306 def expand_paths_if_needed(paths, mode, num, fs, name_function):
307 """Expand paths if they have a ``*`` in them.
308
309 paths: list of paths
310 mode: str
311 Mode in which to open files.
312 num: int
313 If opening in writing mode, number of files we expect to create.
314 fs: filesystem object
315 name_function: callable
316 If opening in writing mode, this callable is used to generate path
317 names. Names are generated for each partition by
318 ``urlpath.replace('*', name_function(partition_index))``.
319 Returns a list of expanded paths.
320 """
321 expanded_paths = []
322 paths = list(paths)
323 if "w" in mode and sum([1 for p in paths if "*" in p]) > 1:
324 raise ValueError("When writing data, only one filename mask can be specified.")
325 elif "w" in mode:
326 num = max(num, len(paths))
327 for curr_path in paths:
328 if "*" in curr_path:
329 if "w" in mode:
330 # expand using name_function
331 expanded_paths.extend(_expand_paths(curr_path, name_function, num))
332 else:
333 # expand using glob
334 expanded_paths.extend(fs.glob(curr_path))
335 else:
336 expanded_paths.append(curr_path)
337 # if we generated more paths than asked for, trim the list
338 if "w" in mode and len(expanded_paths) > num:
339 expanded_paths = expanded_paths[:num]
340 return expanded_paths
341
342
343 def get_fs_token_paths(
344 urlpath, mode="rb", num=1, name_function=None, storage_options=None, protocol=None
345 ):
346 """Filesystem, deterministic token, and paths from a urlpath and options.
347
348 Parameters
349 ----------
350 urlpath: string or iterable
351 Absolute or relative filepath, URL (may include protocols like
352 ``s3://``), or globstring pointing to data.
353 mode: str, optional
354 Mode in which to open files.
355 num: int, optional
356 If opening in writing mode, number of files we expect to create.
357 name_function: callable, optional
358 If opening in writing mode, this callable is used to generate path
359 names. Names are generated for each partition by
360 ``urlpath.replace('*', name_function(partition_index))``.
361 storage_options: dict, optional
362 Additional keywords to pass to the filesystem class.
363 protocol: str or None
364 To override the protocol specifier in the URL
365 """
366 if isinstance(urlpath, (list, tuple)):
367 if not urlpath:
368 raise ValueError("empty urlpath sequence")
369 protocols, paths = zip(*map(split_protocol, urlpath))
370 protocol = protocol or protocols[0]
371 if not all(p == protocol for p in protocols):
372 raise ValueError(
373 "When specifying a list of paths, all paths must "
374 "share the same protocol"
375 )
376 cls = get_filesystem_class(protocol)
377 optionss = list(map(cls._get_kwargs_from_urls, urlpath))
378 paths = [cls._strip_protocol(u) for u in urlpath]
379 options = optionss[0]
380 if not all(o == options for o in optionss):
381 raise ValueError(
382 "When specifying a list of paths, all paths must "
383 "share the same file-system options"
384 )
385 update_storage_options(options, storage_options)
386 fs = cls(**options)
387 paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
388
389 elif isinstance(urlpath, str) or hasattr(urlpath, "name"):
390 protocols, path = split_protocol(urlpath)
391 protocol = protocol or protocols
392 cls = get_filesystem_class(protocol)
393
394 options = cls._get_kwargs_from_urls(urlpath)
395 path = cls._strip_protocol(urlpath)
396 update_storage_options(options, storage_options)
397 fs = cls(**options)
398
399 if "w" in mode:
400 paths = _expand_paths(path, name_function, num)
401 elif "*" in path:
402 paths = sorted(fs.glob(path))
403 else:
404 paths = [path]
405
406 else:
407 raise TypeError("url type not understood: %s" % urlpath)
408
409 return fs, fs._fs_token, paths
410
411
412 def _expand_paths(path, name_function, num):
413 if isinstance(path, str):
414 if path.count("*") > 1:
415 raise ValueError("Output path spec must contain exactly one '*'.")
416 elif "*" not in path:
417 path = os.path.join(path, "*.part")
418
419 if name_function is None:
420 name_function = build_name_function(num - 1)
421
422 paths = [path.replace("*", name_function(i)) for i in range(num)]
423 if paths != sorted(paths):
424 logger.warning(
425 "In order to preserve order between partitions"
426 " paths created with ``name_function`` should "
427 "sort to partition order"
428 )
429 elif isinstance(path, (tuple, list)):
430 assert len(path) == num
431 paths = list(path)
432 else:
433 raise ValueError(
434 "Path should be either\n"
435 "1. A list of paths: ['foo.json', 'bar.json', ...]\n"
436 "2. A directory: 'foo/\n"
437 "3. A path with a '*' in it: 'foo.*.json'"
438 )
439 return paths
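# Illustrative sketch of get_fs_token_paths with a write-mode globstring on
# the local filesystem (nothing is created until the files are opened):
#
#     fs, token, paths = get_fs_token_paths("out/part-*.csv", mode="wb", num=3)
#     # paths -> three concrete names with "*" replaced by 0, 1 and 2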
0 from __future__ import print_function
1 import os
2 import stat
3 from errno import ENOENT, EIO
4 from fuse import Operations, FuseOSError
5 import threading
6 import time
7 from fuse import FUSE
8
9
10 class FUSEr(Operations):
11 def __init__(self, fs, path):
12 self.fs = fs
13 self.cache = {}
14 self.root = path.rstrip("/") + "/"
15 self.counter = 0
16
17 def getattr(self, path, fh=None):
18 path = "".join([self.root, path.lstrip("/")]).rstrip("/")
19 try:
20 info = self.fs.info(path)
21 except FileNotFoundError:
22 raise FuseOSError(ENOENT)
23 data = {"st_uid": 1000, "st_gid": 1000}
24 perm = 0o777
25
26 if info["type"] != "file":
27 data["st_mode"] = stat.S_IFDIR | perm
28 data["st_size"] = 0
29 data["st_blksize"] = 0
30 else:
31 data["st_mode"] = stat.S_IFREG | perm
32 data["st_size"] = info["size"]
33 data["st_blksize"] = 5 * 2 ** 20
34 data["st_nlink"] = 1
35 data["st_atime"] = time.time()
36 data["st_ctime"] = time.time()
37 data["st_mtime"] = time.time()
38 return data
39
40 def readdir(self, path, fh):
41 path = "".join([self.root, path.lstrip("/")])
42 files = self.fs.ls(path, False)
43 files = [os.path.basename(f.rstrip("/")) for f in files]
44 return [".", ".."] + files
45
46 def mkdir(self, path, mode):
47 path = "".join([self.root, path.lstrip("/")])
48 self.fs.mkdir(path)
49 return 0
50
51 def rmdir(self, path):
52 path = "".join([self.root, path.lstrip("/")])
53 self.fs.rmdir(path)
54 return 0
55
56 def read(self, path, size, offset, fh):
57 f = self.cache[fh]
58 f.seek(offset)
59 out = f.read(size)
60 return out
61
62 def write(self, path, data, offset, fh):
63 f = self.cache[fh]
64 f.write(data)
65 return len(data)
66
67 def create(self, path, flags, fi=None):
68 fn = "".join([self.root, path.lstrip("/")])
69 f = self.fs.open(fn, "wb")
70 self.cache[self.counter] = f
71 self.counter += 1
72 return self.counter - 1
73
74 def open(self, path, flags):
75 fn = "".join([self.root, path.lstrip("/")])
76 if flags % 2 == 0:
77 # read
78 mode = "rb"
79 else:
80 # write/create
81 mode = "wb"
82 self.cache[self.counter] = self.fs.open(fn, mode)
83 self.counter += 1
84 return self.counter - 1
85
86 def truncate(self, path, length, fh=None):
87 fn = "".join([self.root, path.lstrip("/")])
88 if length != 0:
89 raise NotImplementedError
90 # maybe should be no-op since open with write sets size to zero anyway
91 self.fs.touch(fn)
92
93 def unlink(self, path):
94 fn = "".join([self.root, path.lstrip("/")])
95 try:
96 self.fs.rm(fn, False)
97 except (IOError, FileNotFoundError):
98 raise FuseOSError(EIO)
99
100 def release(self, path, fh):
101 try:
102 if fh in self.cache:
103 f = self.cache[fh]
104 f.close()
105 self.cache.pop(fh)
106 except Exception as e:
107 print(e)
108 return 0
109
110 def chmod(self, path, mode):
111 raise NotImplementedError
112
113
114 def run(fs, path, mount_point, foreground=True, threads=False):
115 """ Mount stuff in a local directory
116
117 This uses fusepy to make it appear as if a given path on an fsspec
118 instance is in fact resident within the local file-system.
119
120 This requires that fusepy be installed, and that FUSE be available on
121 the system (typically requiring a package to be installed with
122 apt, yum, brew, etc.).
123
124 Parameters
125 ----------
126 fs: file-system instance
127 From one of the compatible implementations
128 path: str
129 Location on that file-system to regard as the root directory to
130 mount. Note that you typically should include the terminating "/"
131 character.
132 mount_point: str
133 An empty directory on the local file-system where the contents of
134 the remote path will appear
135 foreground: bool
136 Whether or not calling this function will block. Operation will
137 typically be more stable if True.
138 threads: bool
139 Whether or not to create threads when responding to file operations
140 within the mounted directory. Operation will typically be more
141 stable if False.
142
143 """
144 func = lambda: FUSE(
145 FUSEr(fs, path), mount_point, nothreads=not threads, foreground=True
146 )
147 if foreground is False:
148 th = threading.Thread(target=func)
149 th.daemon = True
150 th.start()
151 return th
152 else: # pragma: no cover
153 try:
154 func()
155 except KeyboardInterrupt:
156 pass
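# Usage sketch (assumes fusepy and a working FUSE installation, plus an
# existing empty directory "/mnt/mem"): expose an in-memory filesystem
# locally and block until interrupted.
#
#     import fsspec
#     fs = fsspec.filesystem("memory")
#     run(fs, "/", "/mnt/mem", foreground=True)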
0 import time
1 import pickle
2 import logging
3 import os
4 import hashlib
5 import tempfile
6 import inspect
7 from fsspec import AbstractFileSystem, filesystem
8 from fsspec.spec import AbstractBufferedFile
9 from fsspec.core import MMapCache, BaseCache
10
11 logger = logging.getLogger("fsspec")
12
13
14 class CachingFileSystem(AbstractFileSystem):
15 """Locally caching filesystem, layer over any other FS
16
17 This class implements chunk-wise local storage of remote files, for quick
18 access after the initial download. The files are stored in a given
19 directory with random hashes for the filenames. If no directory is given,
20 a temporary one is used, which should be cleaned up by the OS after the
21 process ends. The files themselves are sparse (as implemented in
22 MMapCache), so only the data which is accessed takes up space.
23
24 Restrictions:
25
26 - the block-size must be the same for each access of a given file, unless
27 all blocks of the file have already been read
28 - caching can only be applied to file-systems which produce files
29 derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
30 allowed, for testing
31 """
32
33 protocol = ("blockcache", "cached")
34
35 def __init__(
36 self,
37 target_protocol=None,
38 cache_storage="TMP",
39 cache_check=10,
40 check_files=False,
41 expiry_time=604800,
42 target_options=None,
43 **kwargs
44 ):
45 """
46
47 Parameters
48 ----------
49 target_protocol: str
50 Target filesystem protocol
51 cache_storage: str or list(str)
52 Location to store files. If "TMP", this is a temporary directory,
53 and will be cleaned up by the OS when this process ends (or later).
54 If a list, each location will be tried in the order given, but
55 only the last will be considered writable.
56 cache_check: int
57 Number of seconds between reload of cache metadata
58 check_files: bool
59 Whether to explicitly see if the UID of the remote file matches
60 the stored one before using. Warning: some file systems such as
61 HTTP cannot reliably give a unique hash of the contents of some
62 path, so be sure to set this option to False.
63 expiry_time: int
64 The time in seconds after which a local copy is considered useless.
65 Set to falsy to prevent expiry. The default is equivalent to one
66 week.
67 target_options: dict or None
68 Passed to the instantiation of the FS, if fs is None.
69 """
70 if self._cached:
71 return
72 super().__init__(**kwargs)
73 if cache_storage == "TMP":
74 storage = [tempfile.mkdtemp()]
75 else:
76 if isinstance(cache_storage, str):
77 storage = [cache_storage]
78 else:
79 storage = cache_storage
80 os.makedirs(storage[-1], exist_ok=True)
81 self.storage = storage
82 self.kwargs = target_options or {}
83 self.cache_check = cache_check
84 self.check_files = check_files
85 self.expiry = expiry_time
86 self.load_cache()
87 if isinstance(target_protocol, AbstractFileSystem):
88 self.fs = target_protocol
89 self.protocol = self.fs.protocol
90 else:
91 self.protocol = target_protocol
92 self.fs = filesystem(target_protocol, **self.kwargs)
93
94 def __reduce_ex__(self, *_):
95 return (
96 self.__class__,
97 (
98 self.protocol,
99 self.storage,
100 self.cache_check,
101 self.check_files,
102 self.expiry,
103 self.kwargs or None,
104 ),
105 )
106
107 def load_cache(self):
108 """Read set of stored blocks from file"""
109 cached_files = []
110 for storage in self.storage:
111 fn = os.path.join(storage, "cache")
112 if os.path.exists(fn):
113 with open(fn, "rb") as f:
114 # TODO: consolidate blocks here
115 cached_files.append(pickle.load(f))
116 else:
117 os.makedirs(storage, exist_ok=True)
118 cached_files.append({})
119 self.cached_files = cached_files or [{}]
120 self.last_cache = time.time()
121
122 def save_cache(self):
123 """Save set of stored blocks from file"""
124 fn = os.path.join(self.storage[-1], "cache")
125 # TODO: a file lock could be used to ensure file does not change
126 # between re-read and write; but occasional duplicated reads ok.
127 cache = self.cached_files[-1]
128 if os.path.exists(fn):
129 with open(fn, "rb") as f:
130 cached_files = pickle.load(f)
131 for k, c in cached_files.items():
132 if c["blocks"] is not True:
133 if cache[k]["blocks"] is True:
134 c["blocks"] = True
135 else:
136 c["blocks"] = set(c["blocks"]).union(cache[k]["blocks"])
137 else:
138 cached_files = cache
139 cache = {k: v.copy() for k, v in cached_files.items()}
140 for c in cache.values():
141 if isinstance(c["blocks"], set):
142 c["blocks"] = list(c["blocks"])
143 with open(fn + ".temp", "wb") as f:
144 pickle.dump(cache, f)
145 if os.path.exists(fn):
146 os.remove(fn)
147 os.rename(fn + ".temp", fn)
148
149 def _check_cache(self):
150 """Reload caches if time elapsed or any disappeared"""
151 if not self.cache_check:
152 # explicitly told not to bother checking
153 return
154 timecond = time.time() - self.last_cache > self.cache_check
155 existcond = all(os.path.exists(storage) for storage in self.storage)
156 if timecond or not existcond:
157 self.load_cache()
158
159 def _check_file(self, path):
160 """Is path in cache and still valid"""
161 self._check_cache()
162 for storage, cache in zip(self.storage, self.cached_files):
163 if path not in cache:
164 continue
165 detail = cache[path].copy()
166 if self.check_files:
167 if detail["uid"] != self.fs.ukey(path):
168 continue
169 if self.expiry:
170 if detail["time"] - time.time() > self.expiry:
171 continue
172 fn = os.path.join(storage, detail["fn"])
173 if os.path.exists(fn):
174 return detail, fn
175 return False, None
176
177 def _open(self, path, mode="rb", **kwargs):
178 """Wrap the target _open
179
180 If the whole file exists in the cache, just open it locally and
181 return that.
182
183 Otherwise, open the file on the target FS, and make it have a mmap
184 cache pointing to the location which we determine, in our cache.
185 The ``blocks`` instance is shared, so as the mmap cache instance
186 updates, so does the entry in our ``cached_files`` attribute.
187 We monkey-patch this file, so that when it closes, we call
188 ``close_and_update`` to save the state of the blocks.
189 """
190 path = self._strip_protocol(path)
191 if not path.startswith(self.protocol):
192 path = self.protocol + "://" + path
193 if mode != "rb":
194 return self.fs._open(path, mode=mode, **kwargs)
195 detail, fn = self._check_file(path)
196 if detail:
197 # file is in cache
198 hash, blocks = detail["fn"], detail["blocks"]
199 if blocks is True:
200 # stored file is complete
201 logger.debug("Opening local copy of %s" % path)
202 return open(fn, "rb")
203 # TODO: action where partial file exists in read-only cache
204 logger.debug("Opening partially cached copy of %s" % path)
205 else:
206 hash = hashlib.sha256(path.encode()).hexdigest()
207 fn = os.path.join(self.storage[-1], hash)
208 blocks = set()
209 detail = {
210 "fn": hash,
211 "blocks": blocks,
212 "time": time.time(),
213 "uid": self.fs.ukey(path),
214 }
215 self.cached_files[-1][path] = detail
216 logger.debug("Creating local sparse file for %s" % path)
217 kwargs["cache_type"] = "none"
218 kwargs["mode"] = mode
219
220 # call target filesystems open
221 f = self.fs._open(path, **kwargs)
222 if "blocksize" in detail:
223 if detail["blocksize"] != f.blocksize:
224 raise ValueError(
225 "Cached file must be reopened with same block"
226 "size as original (old: %i, new %i)"
227 "" % (detail["blocksize"], f.blocksize)
228 )
229 else:
230 detail["blocksize"] = f.blocksize
231 f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks)
232 close = f.close
233 f.close = lambda: self.close_and_update(f, close)
234 return f
235
236 def close_and_update(self, f, close):
237 """Called when a file is closing, so store the set of blocks"""
238 if f.path.startswith(self.protocol):
239 path = f.path
240 else:
241 path = self.protocol + "://" + f.path
242 c = self.cached_files[-1][path]
243 if c["blocks"] is not True and len(["blocks"]) * f.blocksize >= f.size:
244 c["blocks"] = True
245 self.save_cache()
246 close()
247
248 def __getattribute__(self, item):
249 if item in [
250 "load_cache",
251 "_open",
252 "save_cache",
253 "close_and_update",
254 "__init__",
255 "__getattribute__",
256 "__reduce_ex__",
257 "open",
258 "cat",
259 "get",
260 "read_block",
261 "tail",
262 "head",
263 "_check_file",
264 "_check_cache",
265 ]:
266 # all the methods defined in this class. Note `open` here, since
267 # it calls `_open`, but is actually in superclass
268 return lambda *args, **kw: getattr(type(self), item)(self, *args, **kw)
269 if item == "__class__":
270 return type(self)
271 d = object.__getattribute__(self, "__dict__")
272 fs = d.get("fs", None) # fs is not immediately defined
273 if item in d:
274 return d[item]
275 elif fs is not None:
276 if item in fs.__dict__:
277 # attribute of instance
278 return fs.__dict__[item]
279 # attribute belonging to the target filesystem
280 cls = type(fs)
281 m = getattr(cls, item)
282 if inspect.isfunction(m) and (
283 not hasattr(m, "__self__") or m.__self__ is None
284 ):
285 # instance method
286 return m.__get__(fs, cls)
287 return m # class method or attribute
288 else:
289 # attributes of the superclass, while target is being set up
290 return super().__getattribute__(item)
291
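# Usage sketch (assumes the HTTP implementation's dependencies are installed,
# the URL is reachable, and "/tmp/fsspec-cache" is writable): blocks are
# fetched on demand and reused on later opens of the same remote file.
#
#     import fsspec
#     fs = fsspec.filesystem(
#         "blockcache", target_protocol="http", cache_storage="/tmp/fsspec-cache"
#     )
#     with fs.open("http://example.com/some/file.bin", "rb") as f:
#         head = f.read(1024)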
292
293 class WholeFileCacheFileSystem(CachingFileSystem):
294 """Caches whole remote files on first access
295
296 This class is intended as a layer over any other file system, and
297 will make a local copy of each file accessed, so that all subsequent
298 reads are local. This is similar to ``CachingFileSystem``, but without
299 the block-wise functionality and so can work even when sparse files
300 are not allowed. See its docstring for definition of the init
301 arguments.
302
303 The class still needs access to the remote store for listing files,
304 and may refresh cached files.
305 """
306
307 protocol = "filecache"
308
309 def _open(self, path, mode="rb", **kwargs):
310 path = self._strip_protocol(path)
311 if not path.startswith(self.protocol):
312 path = self.protocol + "://" + path
313 if mode != "rb":
314 return self.fs._open(path, mode=mode, **kwargs)
315 detail, fn = self._check_file(path)
316 if detail:
317 hash, blocks = detail["fn"], detail["blocks"]
318 if blocks is True:
319 logger.debug("Opening local copy of %s" % path)
320 return open(fn, "rb")
321 else:
322 raise ValueError(
323 "Attempt to open partially cached file %s"
324 "as a wholly cached file" % path
325 )
326 else:
327 hash = hashlib.sha256(path.encode()).hexdigest()
328 fn = os.path.join(self.storage[-1], hash)
329 blocks = True
330 detail = {
331 "fn": hash,
332 "blocks": blocks,
333 "time": time.time(),
334 "uid": self.fs.ukey(path),
335 }
336 self.cached_files[-1][path] = detail
337 logger.debug("Copying %s to local cache" % path)
338 kwargs["mode"] = mode
339
340 # call target filesystems open
341 # TODO: why not just use fs.get ??
342 f = self.fs._open(path, **kwargs)
343 with open(fn, "wb") as f2:
344 if isinstance(f, AbstractBufferedFile):
345 # want no type of caching if just downloading whole thing
346 f.cache = BaseCache(0, f.cache.fetcher, f.size)
347 if getattr(f, "blocksize", 0) and f.size:
348 # opportunity to parallelise here
349 data = True
350 while data:
351 data = f.read(f.blocksize)
352 f2.write(data)
353 else:
354 # this only applies to HTTP, should instead use streaming
355 f2.write(f.read())
356 self.save_cache()
357 return self._open(path, mode)
0 from distributed.worker import get_worker
1 from distributed.client import _get_global_client
2 import dask
3 from fsspec.spec import AbstractFileSystem, AbstractBufferedFile
4 from fsspec import filesystem
5
6
7 def make_instance(cls, args, kwargs):
8 inst = cls(*args, **kwargs)
9 inst._determine_worker()
10 return inst
11
12
13 class DaskWorkerFileSystem(AbstractFileSystem):
14 """View files accessible to a worker as any other remote file-system
15
16 When instances are run on the worker, uses the real filesystem. When
17 run on the client, they call the worker to provide information or data.
18
19 **Warning** this implementation is experimental, and read-only for now.
20 """
21
22 def __init__(self, remote_protocol, remote_options=None, **kwargs):
23 super().__init__(**kwargs)
24 self.protocol = remote_protocol
25 self.remote_options = remote_options
26 self.worker = None
27 self.client = None
28 self.fs = None
29 self._determine_worker()
30
31 def _determine_worker(self):
32 try:
33 get_worker()
34 self.worker = True
35 self.fs = filesystem(self.protocol, **(self.remote_options or {}))
36 except ValueError:
37 self.worker = False
38 self.client = _get_global_client()
39 self.rfs = dask.delayed(self)
40
41 def __reduce__(self):
42 return make_instance, (type(self), self.storage_args, self.storage_options)
43
44 def mkdir(self, *args, **kwargs):
45 if self.worker:
46 self.fs.mkdir(*args, **kwargs)
47 else:
48 self.rfs.mkdir(*args, **kwargs).compute()
49
50 def rm(self, *args, **kwargs):
51 if self.worker:
52 self.fs.rm(*args, **kwargs)
53 else:
54 self.rfs.rm(*args, **kwargs).compute()
55
56 def copy(self, *args, **kwargs):
57 if self.worker:
58 self.fs.copy(*args, **kwargs)
59 else:
60 self.rfs.copy(*args, **kwargs).compute()
61
62 def mv(self, *args, **kwargs):
63 if self.worker:
64 self.fs.mv(*args, **kwargs)
65 else:
66 self.rfs.mv(*args, **kwargs).compute()
67
68 def ls(self, *args, **kwargs):
69 if self.worker:
70 return self.fs.ls(*args, **kwargs)
71 else:
72 return self.rfs.ls(*args, **kwargs).compute()
73
74 def _open(self, path, mode="rb", **kwargs):
75 if self.worker:
76 return self.fs._open(path, mode=mode)
77 else:
78 return DaskFile(self, path, mode, **kwargs)
79
80 def fetch_range(self, path, mode, start, end):
81 if self.worker:
82 with self._open(path, mode) as f:
83 f.seek(start)
84 return f.read(end - start)
85 else:
86 return self.rfs.fetch_range(path, mode, start, end).compute()
87
88
89 class DaskFile(AbstractBufferedFile):
90 def __init__(
91 self,
92 fs,
93 path,
94 mode="rb",
95 block_size="default",
96 autocommit=True,
97 cache_type="bytes",
98 **kwargs
99 ):
100 super().__init__(
101 fs,
102 path,
103 mode=mode,
104 block_size=block_size,
105 autocommit=autocommit,
106 cache_type=cache_type,
107 **kwargs
108 )
109
110 def _upload_chunk(self, final=False):
111 pass
112
113 def _initiate_upload(self):
114 """ Create remote file/upload """
115 pass
116
117 def _fetch_range(self, start, end):
118 """Get the specified set of bytes from remote"""
119 return self.fs.fetch_range(self.path, self.mode, start, end)
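# Usage sketch (assumes a running dask.distributed client/cluster and files
# local to the workers under /data): calls made on the client are forwarded
# to a worker, which uses the real filesystem.
#
#     fs = DaskWorkerFileSystem(remote_protocol="file")
#     fs.ls("/data")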
0 from ftplib import FTP, Error, error_perm
1 from socket import timeout
2 import uuid
3 from ..spec import AbstractBufferedFile, AbstractFileSystem
4 from ..utils import infer_storage_options
5
6
7 class FTPFileSystem(AbstractFileSystem):
8 """A filesystem over classic """
9
10 root_marker = "/"
11 cachable = False
12
13 def __init__(
14 self,
15 host,
16 port=21,
17 username=None,
18 password=None,
19 acct=None,
20 block_size=None,
21 tempdir="/tmp",
22 timeout=30,
23 **kwargs
24 ):
25 """
26 You can use _get_kwargs_from_urls to get some kwargs from
27 a reasonable FTP url.
28
29 Authentication will be anonymous if username/password are not
30 given.
31
32 Parameters
33 ----------
34 host: str
35 The remote server name/ip to connect to
36 port: int
37 Port to connect with
38 username: str or None
39 If authenticating, the user's identifier
40 password: str or None
41 User's password on the server, if using
42 acct: str or None
43 Some servers also need an "account" string for auth
44 block_size: int or None
45 If given, the read-ahead or write buffer size.
46 tempdir: str
47 Directory on remote to put temporary files when in a transaction
48 """
49 super(FTPFileSystem, self).__init__(**kwargs)
50 self.host = host
51 self.port = port
52 self.tempdir = tempdir
53 self.cred = username, password, acct
54 self.timeout = timeout
55 if block_size is not None:
56 self.blocksize = block_size
57 else:
58 self.blocksize = 2 ** 16
59 self._connect()
60
61 def _connect(self):
62 self.ftp = FTP(timeout=self.timeout)
63 self.ftp.connect(self.host, self.port)
64 self.ftp.login(*self.cred)
65
66 @classmethod
67 def _strip_protocol(cls, path):
68 return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
69
70 @staticmethod
71 def _get_kwargs_from_urls(urlpath):
72 out = infer_storage_options(urlpath)
73 out.pop("path", None)
74 out.pop("protocol", None)
75 return out
76
77 def invalidate_cache(self, path=None):
78 if path is not None:
79 self.dircache.pop(path, None)
80 else:
81 self.dircache.clear()
82
83 def ls(self, path, detail=True):
84 path = self._strip_protocol(path)
85 out = []
86 if path not in self.dircache:
87 try:
88 try:
89 out = [
90 (fn, details)
91 for (fn, details) in self.ftp.mlsd(path)
92 if fn not in [".", ".."]
93 and details["type"] not in ["pdir", "cdir"]
94 ]
95 except error_perm:
96 out = _mlsd2(self.ftp, path) # Not platform independent
97 for fn, details in out:
98 if path == "/":
99 path = "" # just for forming the names, below
100 details["name"] = "/".join([path, fn.lstrip("/")])
101 if details["type"] == "file":
102 details["size"] = int(details["size"])
103 else:
104 details["size"] = 0
105 self.dircache[path] = out
106 except Error:
107 try:
108 info = self.info(path)
109 if info["type"] == "file":
110 out = [(path, info)]
111 except (Error, IndexError):
112 raise FileNotFoundError
113 files = self.dircache.get(path, out)
114 if not detail:
115 return sorted([fn for fn, details in files])
116 return [details for fn, details in files]
117
118 def info(self, path, **kwargs):
119 # implement with direct method
120 path = self._strip_protocol(path)
121 files = self.ls(self._parent(path).lstrip("/"), True)
122 try:
123 out = [f for f in files if f["name"] == path][0]
124 except IndexError:
125 raise FileNotFoundError(path)
126 return out
127
128 def _open(self, path, mode="rb", block_size=None, autocommit=True, **kwargs):
129 path = self._strip_protocol(path)
130 block_size = block_size or self.blocksize
131 return FTPFile(
132 self,
133 path,
134 mode=mode,
135 block_size=block_size,
136 tempdir=self.tempdir,
137 autocommit=autocommit,
138 )
139
140 def _rm(self, path):
141 path = self._strip_protocol(path)
142 self.ftp.delete(path)
143 self.invalidate_cache(path.rsplit("/", 1)[0])
144
145 def mkdir(self, path, **kwargs):
146 path = self._strip_protocol(path)
147 self.ftp.mkd(path)
148
149 def rmdir(self, path):
150 path = self._strip_protocol(path)
151 self.ftp.rmd(path)
152
153 def mv(self, path1, path2, **kwargs):
154 path1 = self._strip_protocol(path1)
155 path2 = self._strip_protocol(path2)
156 self.ftp.rename(path1, path2)
157 self.invalidate_cache(self._parent(path1))
158 self.invalidate_cache(self._parent(path2))
159
160 def __del__(self):
161 self.ftp.close()
162
163
164 class TransferDone(Exception):
165 """Internal exception to break out of transfer"""
166
167 pass
168
169
170 class FTPFile(AbstractBufferedFile):
171 """Interact with a remote FTP file with read/write buffering"""
172
173 def __init__(self, fs, path, **kwargs):
174 super().__init__(fs, path, **kwargs)
175 if kwargs.get("autocommit", False) is False:
176 self.target = self.path
177 self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
178
179 def commit(self):
180 self.fs.mv(self.path, self.target)
181
182 def discard(self):
183 self.fs.rm(self.path)
184
185 def _fetch_range(self, start, end):
186 """Get bytes between given byte limits
187
188 Implemented by raising an exception in the fetch callback when the
189 number of bytes received reaches the requested amount.
190
191 Will fail if the server does not respect the REST command on
192 retrieve requests.
193 """
194 out = []
195 total = [0]
196
197 def callback(x):
198 total[0] += len(x)
199 if total[0] > end - start:
200 out.append(x[: (end - start) - total[0]])
201 raise TransferDone
202 else:
203 out.append(x)
204
205 if total[0] == end - start:
206 raise TransferDone
207
208 try:
209 self.fs.ftp.retrbinary(
210 "RETR %s" % self.path,
211 blocksize=self.blocksize,
212 rest=start,
213 callback=callback,
214 )
215 except TransferDone:
216 try:
217 self.fs.ftp.abort()
218 self.fs.ftp.voidresp()
219 except timeout:
220 self.fs._connect()
221 return b"".join(out)
222
223 def _upload_chunk(self, final=False):
224 self.buffer.seek(0)
225 self.fs.ftp.storbinary(
226 "STOR " + self.path, self.buffer, blocksize=self.blocksize, rest=self.offset
227 )
228 return True
229
230
231 def _mlsd2(ftp, path="."):
232 """
233 Fall back to using `dir` instead of `mlsd` if not supported.
234
235 This parses a Linux style `ls -l` response to `dir`, but the response may
236 be platform dependent.
237
238 Parameters
239 ----------
240 ftp: ftplib.FTP
241 path: str
242 Path to list; defaults to the current directory, ".".
243 """
244 lines = []
245 minfo = []
246 ftp.dir(path, lines.append)
247 for line in lines:
248 line = line.split()
249 this = (
250 line[-1],
251 {
252 "modify": " ".join(line[5:8]),
253 "unix.owner": line[2],
254 "unix.group": line[3],
255 "unix.mode": line[0],
256 "size": line[4],
257 },
258 )
259 if "d" == this[1]["unix.mode"][0]:
260 this[1]["type"] = "dir"
261 else:
262 this[1]["type"] = "file"
263 minfo.append(this)
264 return minfo
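
A minimal usage sketch of the FTP implementation above; the server address and credentials are placeholders, and listing falls back to `_mlsd2` when the server does not support MLSD:

from fsspec.implementations.ftp import FTPFileSystem

# placeholder connection details -- substitute a real server and credentials
fs = FTPFileSystem("ftp.example.com", 21, "anonymous", "guest@example.com")

print(fs.ls("/", detail=False))               # MLSD listing, or the `dir` fallback
with fs.open("/pub/somefile.txt", "rb", block_size=2 ** 16) as f:
    header = f.read(100)                      # fetched via REST + RETR, truncated by the callback
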
0 import io
1 import requests
2 from ..spec import AbstractFileSystem
3
4
5 class GithubFileSystem(AbstractFileSystem):
6 """[Experimental] interface to files in github
7
8 An instance of this class provides access to the files residing within a
9 remote github repository. You may specify a point in the repo's history,
10 by SHA, branch or tag (the default is the current master).
11
12 Given that code files tend to be small, and that github does not support
13 retrieving partial content, we always fetch whole files.
14 """
15
16 url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
17 rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
18 protocol = "github"
19
20 def __init__(self, org, repo, sha="master", **kwargs):
21 super().__init__(**kwargs)
22 self.org = org
23 self.repo = repo
24 self.root = sha
25 self.ls("")
26
27 def ls(self, path, detail=False, sha=None, **kwargs):
28 if path == "":
29 sha = self.root
30 if sha is None:
31 parts = path.rstrip("/").split("/")
32 so_far = ""
33 sha = self.root
34 for part in parts:
35 out = self.ls(so_far, True, sha=sha)
36 so_far += "/" + part if so_far else part
37 out = [o for o in out if o["name"] == so_far][0]
38 if out["type"] == "file":
39 if detail:
40 return [out]
41 else:
42 return path
43 sha = out["sha"]
44 if path not in self.dircache:
45 r = requests.get(self.url.format(org=self.org, repo=self.repo, sha=sha))
46 self.dircache[path] = [
47 {
48 "name": path + "/" + f["path"] if path else f["path"],
49 "mode": f["mode"],
50 "type": {"blob": "file", "tree": "directory"}[f["type"]],
51 "size": f.get("size", 0),
52 "sha": f["sha"],
53 }
54 for f in r.json()["tree"]
55 ]
56 if detail:
57 return self.dircache[path]
58 else:
59 return sorted([f["name"] for f in self.dircache[path]])
60
61 def _open(self, path, mode="rb", **kwargs):
62 if mode != "rb":
63 raise NotImplementedError
64 url = self.rurl.format(org=self.org, repo=self.repo, path=path, sha=self.root)
65 r = requests.get(url)
66 return io.BytesIO(r.content)
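
A short, hedged sketch of how the experimental GitHub filesystem above might be used; the repository coordinates and file path are illustrative, and a network connection is required:

from fsspec.implementations.github import GithubFileSystem

# illustrative repository; any public org/repo/sha should work
fs = GithubFileSystem(org="intake", repo="filesystem_spec", sha="master")

print(fs.ls("", detail=False))            # top-level tree of the chosen commit
with fs.open("README.rst", "rb") as f:    # whole file fetched from raw.githubusercontent.com
    text = f.read()
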
0 from ..spec import AbstractFileSystem
1 from ..utils import infer_storage_options
2 from pyarrow.hdfs import HadoopFileSystem
3
4
5 class PyArrowHDFS(AbstractFileSystem):
6 """Adapted version of Arrow's HadoopFileSystem
7
8 This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which
9 passes on all calls to the underlying class.
10 """
11
12 def __init__(
13 self,
14 host="default",
15 port=0,
16 user=None,
17 kerb_ticket=None,
18 driver="libhdfs",
19 extra_conf=None,
20 **kwargs
21 ):
22 """
23
24 Parameters
25 ----------
26 host: str
27 Hostname, IP or "default" to try to read from Hadoop config
28 port: int
29 Port to connect on, or default from Hadoop config if 0
30 user: str or None
31 If given, connect as this username
32 kerb_ticket: str or None
33 If given, use this ticket for authentication
34 driver: 'libhdfs' or 'libhdfs3'
35 Binary driver; libhdfs is the JNI library and the default
36 extra_conf: None or dict
37 Passed on to HadoopFileSystem
38 """
39 if self._cached:
40 return
41 AbstractFileSystem.__init__(self, **kwargs)
42 self.pars = (host, port, user, kerb_ticket, driver, extra_conf)
43 self.pahdfs = HadoopFileSystem(
44 host=host,
45 port=port,
46 user=user,
47 kerb_ticket=kerb_ticket,
48 driver=driver,
49 extra_conf=extra_conf,
50 )
51
52 def _open(self, path, mode="rb", block_size=None, autocommit=True, **kwargs):
53 """
54
55 Parameters
56 ----------
57 path: str
58 Location of file; should start with '/'
59 mode: str
60 block_size: int
61 Hadoop block size, e.g., 2**26
62 autocommit: bool
63 Transactions are not yet implemented for HDFS; errors if not True
64 kwargs: dict or None
65 Hadoop config parameters
66
67 Returns
68 -------
69 HDFSFile file-like instance
70 """
71 if not autocommit:
72 raise NotImplementedError
73 return HDFSFile(self, path, mode, block_size, **kwargs)
74
75 def __reduce_ex__(self, protocol):
76 return PyArrowHDFS, self.pars
77
78 def ls(self, path, detail=True):
79 out = self.pahdfs.ls(path, detail)
80 if detail:
81 for p in out:
82 p["type"] = p["kind"]
83 p["name"] = self._strip_protocol(p["name"])
84 else:
85 out = [self._strip_protocol(p) for p in out]
86 return out
87
88 @staticmethod
89 def _get_kwargs_from_urls(paths):
90 ops = infer_storage_options(paths)
91 out = {}
92 if ops.get("host", None):
93 out["host"] = ops["host"]
94 if ops.get("username", None):
95 out["user"] = ops["username"]
96 if ops.get("port", None):
97 out["port"] = ops["port"]
98 return out
99
100 @classmethod
101 def _strip_protocol(cls, path):
102 ops = infer_storage_options(path)
103 return ops["path"]
104
105 def __getattribute__(self, item):
106 if item in [
107 "_open",
108 "__init__",
109 "__getattribute__",
110 "__reduce_ex__",
111 "open",
112 "ls",
113 "makedirs",
114 ]:
115 # all the methods defined in this class. Note `open` here: it is
116 # defined in the superclass, but ends up calling our `_open`
117 return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw)
118 if item == "__class__":
119 return PyArrowHDFS
120 d = object.__getattribute__(self, "__dict__")
121 pahdfs = d.get("pahdfs", None) # not set until __init__ has completed
122 if pahdfs is not None and item in [
123 "chmod",
124 "chown",
125 "user",
126 "df",
127 "disk_usage",
128 "download",
129 "driver",
130 "exists",
131 "extra_conf",
132 "get_capacity",
133 "get_space_used",
134 "host",
135 "is_open",
136 "kerb_ticket",
137 "strip_protocol",
138 "mkdir",
139 "mv",
140 "port",
141 "get_capacity",
142 "get_space_used",
143 "df",
144 "chmod",
145 "chown",
146 "disk_usage",
147 "download",
148 "upload",
149 "_get_kwargs_from_urls",
150 "read_parquet",
151 "rm",
152 "stat",
153 "upload",
154 ]:
155 return getattr(pahdfs, item)
156 else:
157 # attributes of the superclass, while target is being set up
158 return super().__getattribute__(item)
159
160
161 class HDFSFile(object):
162 """Wrapper around arrow's HdfsFile
163
164 Allows seek beyond EOF and (eventually) commit/discard
165 """
166
167 def __init__(self, fs, path, mode, block_size, **kwargs):
168 self.fs = fs
169 self.path = path
170 self.mode = mode
171 self.block_size = block_size
172 self.fh = fs.pahdfs.open(path, mode, block_size, **kwargs)
173 if self.fh.readable():
174 self.seek_size = self.size()
175
176 def seek(self, loc, whence=0):
177 if whence == 0 and self.readable():
178 loc = min(loc, self.seek_size)
179 return self.fh.seek(loc, whence)
180
181 def __getattr__(self, item):
182 return getattr(self.fh, item)
183
184 def __reduce_ex__(self, protocol):
185 return HDFSFile, (self.fs, self.path, self.mode, self.block_size)
186
187 def __enter__(self):
188 return self
189
190 def __exit__(self, exc_type, exc_val, exc_tb):
191 self.close()
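
A usage sketch for the HDFS wrapper above, assuming a reachable cluster and a working libhdfs installation; host, port and paths are placeholders:

from fsspec.implementations.hdfs import PyArrowHDFS

# placeholder cluster coordinates
fs = PyArrowHDFS(host="namenode.example.com", port=8020, user="hadoop")

print(fs.ls("/user/hadoop"))                      # delegated to pyarrow, names stripped of protocol
with fs.open("/user/hadoop/data.bin", "rb", block_size=2 ** 26) as f:
    chunk = f.read(1024)                          # HDFSFile wraps pyarrow's file object
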
0 from __future__ import print_function, division, absolute_import
1
2 import re
3 import requests
4 from urllib.parse import urlparse
5 from fsspec import AbstractFileSystem
6 from fsspec.spec import AbstractBufferedFile
7 from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE
8
9 # https://stackoverflow.com/a/15926317/3821154
10 ex = re.compile(r"""<a\s+(?:[^>]*?\s+)?href=(["'])(.*?)\1""")
11 ex2 = re.compile(r"""(http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
12
13
14 class HTTPFileSystem(AbstractFileSystem):
15 """
16 Simple File-System for fetching data via HTTP(S)
17
18 ``ls()`` is implemented by loading the parent page and doing a regex
19 match on the result. If simple_links=True, anything that looks like a
20 URL, e.g., "http(s)://server.com/stuff?thing=other", is treated as a
21 link; otherwise only links within HTML href tags are used.
22 """
23
24 sep = "/"
25
26 def __init__(
27 self,
28 simple_links=True,
29 block_size=None,
30 same_scheme=True,
31 size_policy=None,
32 **storage_options
33 ):
34 """
35 Parameters
36 ----------
37 block_size: int
38 Block size, in bytes, for reading; if 0, raw streaming requests
39 file-like objects are returned instead of HTTPFile instances
40 simple_links: bool
41 If True, will consider both HTML <a> tags and anything that looks
42 like a URL; if False, will consider only the former.
43 same_scheme: bool
44 When doing ls/glob, if this is True, only consider paths whose
45 http/https scheme matches that of the input URL.
46 size_policy: this argument is deprecated
47 storage_options: key-value
48 May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
49 other parameters passed on to requests
50 """
51 AbstractFileSystem.__init__(self)
52 self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
53 self.simple_links = simple_links
54 self.same_schema = same_scheme
55 self.kwargs = storage_options
56 self.session = requests.Session()
57
58 @classmethod
59 def _strip_protocol(cls, path):
60 """ For HTTP, we always want to keep the full URL
61 """
62 return path
63
64 # TODO: override get
65
66 def ls(self, url, detail=True):
67 # ignoring URL-encoded arguments
68 r = self.session.get(url, **self.kwargs)
69 if self.simple_links:
70 links = ex2.findall(r.text) + ex.findall(r.text)
71 else:
72 links = ex.findall(r.text)
73 out = set()
74 parts = urlparse(url)
75 for l in links:
76 if isinstance(l, tuple):
77 l = l[1]
78 if l.startswith("http"):
79 if self.same_schema:
80 if l.split(":", 1)[0] == url.split(":", 1)[0]:
81 out.add(l)
82 elif l.replace("https", "http").startswith(
83 url.replace("https", "http")
84 ):
85 # allowed to cross http <-> https
86 out.add(l)
87 elif l.startswith("/") and len(l) > 1:
88 out.add(parts.scheme + "://" + parts.netloc + l)
89 else:
90 if l not in ["..", "../"]:
91 # Ignore FTP-like "parent"
92 out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
93 if not out and url.endswith("/"):
94 return self.ls(url.rstrip("/"), detail=detail)
95 if detail:
96 return [
97 {
98 "name": u,
99 "size": None,
100 "type": "directory" if u.endswith("/") else "file",
101 }
102 for u in out
103 ]
104 else:
105 return list(sorted(out))
106
107 def cat(self, url):
108 r = requests.get(url, **self.kwargs)
109 r.raise_for_status()
110 return r.content
111
112 def mkdirs(self, url):
113 """Make any intermediate directories to make path writable"""
114 raise NotImplementedError
115
116 def exists(self, path):
117 kwargs = self.kwargs.copy()
118 kwargs["stream"] = True
119 try:
120 r = self.session.get(path, **kwargs)
121 r.close()
122 return r.ok
123 except requests.HTTPError:
124 return False
125
126 def _open(self, url, mode="rb", block_size=None, cache_options=None, **kwargs):
127 """Make a file-like object
128
129 Parameters
130 ----------
131 url: str
132 Full URL with protocol
133 mode: string
134 must be "rb"
135 block_size: int or None
136 Bytes to download in one request; use instance value if None. If
137 zero, will return a streaming Requests file-like instance.
138 kwargs: key-value
139 Any other parameters, passed to requests calls
140 """
141 if mode != "rb":
142 raise NotImplementedError
143 block_size = block_size if block_size is not None else self.block_size
144 kw = self.kwargs.copy()
145 kw.update(kwargs)
146 kw.pop("autocommit", None)
147 if block_size:
148 return HTTPFile(
149 self, url, self.session, block_size, cache_options=cache_options, **kw
150 )
151 else:
152 kw["stream"] = True
153 r = self.session.get(url, **kw)
154 r.raise_for_status()
155 r.raw.decode_content = True
156 return r.raw
157
158 def ukey(self, url):
159 """Unique identifier; assume HTTP files are static, unchanging"""
160 return tokenize(url, self.kwargs, self.protocol)
161
162 def info(self, url, **kwargs):
163 """Get info of URL
164
165 Tries to access location via HEAD, and then GET methods, but does
166 not fetch the data.
167
168 It is possible that the server does not supply any size information, in
169 which case size will be given as None (and certain operations on the
170 corresponding file will not work).
171 """
172 size = False
173 for policy in ["head", "get"]:
174 try:
175 size = file_size(url, self.session, policy, **self.kwargs)
176 if size:
177 break
178 except Exception:
179 pass
180 else:
181 # get failed, so conclude URL does not exist
182 if size is False:
183 raise FileNotFoundError(url)
184 return {"name": url, "size": size or None, "type": "file"}
185
186
187 class HTTPFile(AbstractBufferedFile):
188 """
189 A file-like object pointing to a remote HTTP(S) resource
190
191 Supports only reading, with read-ahead of a predetermined block-size.
192
193 In the case that the server does not supply the filesize, only reading of
194 the complete file in one go is supported.
195
196 Parameters
197 ----------
198 url: str
199 Full URL of the remote resource, including the protocol
200 session: requests.Session or None
201 All calls will be made within this session, to avoid restarting
202 connections where the server allows this
203 block_size: int or None
204 The amount of read-ahead to do, in bytes. Default is 5MB, or the value
205 configured for the FileSystem creating this file
206 size: None or int
207 If given, this is the size of the file in bytes, and we don't attempt
208 to call the server to find the value.
209 kwargs: all other key-values are passed to requests calls.
210 """
211
212 def __init__(
213 self,
214 fs,
215 url,
216 session=None,
217 block_size=None,
218 mode="rb",
219 cache_type="bytes",
220 cache_options=None,
221 size=None,
222 **kwargs
223 ):
224 if mode != "rb":
225 raise NotImplementedError("File mode not supported")
226 self.url = url
227 self.session = session if session is not None else requests.Session()
228 if size is not None:
229 self.details = {"name": url, "size": size, "type": "file"}
230 super().__init__(
231 fs=fs,
232 path=url,
233 mode=mode,
234 block_size=block_size,
235 cache_type=cache_type,
236 cache_options=cache_options,
237 **kwargs
238 )
239 self.cache.size = self.size or self.blocksize
240
241 def read(self, length=-1):
242 """Read bytes from file
243
244 Parameters
245 ----------
246 length: int
247 Read up to this many bytes. If negative, read all content to end of
248 file. If the server has not supplied the filesize, attempting to
249 read only part of the data will raise a ValueError.
250 """
251 if (
252 (length < 0 and self.loc == 0) # explicit read all
253 or (length > (self.size or length)) # read more than there is
254 or ( # all fits in one block anyway
255 self.size and self.size < self.blocksize
256 )
257 ):
258 self._fetch_all()
259 if self.size is None:
260 if length < 0:
261 self._fetch_all()
262 else:
263 length = min(self.size - self.loc, length)
264 return super().read(length)
265
266 def _fetch_all(self):
267 """Read whole file in one shot, without caching
268
269 This is only called when position is still at zero,
270 and read() is called without a byte-count.
271 """
272 if not isinstance(self.cache, AllBytes):
273 r = self.session.get(self.url, **self.kwargs)
274 r.raise_for_status()
275 out = r.content
276 self.cache = AllBytes(out)
277 self.size = len(out)
278
279 def _fetch_range(self, start, end):
280 """Download a block of data
281
282 The expectation is that the server returns only the requested bytes,
283 with HTTP code 206. If this is not the case, we first check the headers,
284 and then stream the output - if the data size is bigger than we
285 requested, an exception is raised.
286 """
287 kwargs = self.kwargs.copy()
288 headers = kwargs.pop("headers", {})
289 headers["Range"] = "bytes=%i-%i" % (start, end - 1)
290 r = self.session.get(self.url, headers=headers, stream=True, **kwargs)
291 if r.status_code == 416:
292 # range request outside file
293 return b""
294 r.raise_for_status()
295 if r.status_code == 206:
296 # partial content, as expected
297 out = r.content
298 elif "Content-Length" in r.headers:
299 cl = int(r.headers["Content-Length"])
300 if cl <= end - start:
301 # data size OK
302 out = r.content
303 else:
304 raise ValueError(
305 "Got more bytes (%i) than requested (%i)" % (cl, end - start)
306 )
307 else:
308 cl = 0
309 out = []
310 for chunk in r.iter_content(chunk_size=2 ** 20):
311 # data size unknown, let's see if it goes too big
312 if chunk:
313 out.append(chunk)
314 cl += len(chunk)
315 if cl > end - start:
316 raise ValueError(
317 "Got more bytes so far (>%i) than requested (%i)"
318 % (cl, end - start)
319 )
320 else:
321 break
322 out = b"".join(out)
323 return out
324
325
326 def file_size(url, session=None, size_policy="head", **kwargs):
327 """Call HEAD on the server to get file size
328
329 Default operation is to explicitly allow redirects and use encoding
330 'identity' (no compression) to get the true size of the target.
331 """
332 kwargs = kwargs.copy()
333 ar = kwargs.pop("allow_redirects", True)
334 head = kwargs.get("headers", {}).copy()
335 head["Accept-Encoding"] = "identity"
336 session = session or requests.Session()
337 if size_policy == "head":
338 r = session.head(url, allow_redirects=ar, **kwargs)
339 elif size_policy == "get":
340 kwargs["stream"] = True
341 r = session.get(url, allow_redirects=ar, **kwargs)
342 else:
343 raise TypeError('size_policy must be "head" or "get", got %s' % size_policy)
344 if "Content-Length" in r.headers:
345 return int(r.headers["Content-Length"])
346 elif "Content-Range" in r.headers:
347 return int(r.headers["Content-Range"].split("/")[1])
348
349
350 class AllBytes(object):
351 """Cache entire contents of a remote URL"""
352
353 def __init__(self, data):
354 self.data = data
355
356 def _fetch(self, start, end):
357 return self.data[start:end]
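
Tying the pieces above together, a brief sketch of reading over HTTP; the URLs are placeholders, and the server must expose a listable index page for ``ls`` to find anything:

import fsspec

fs = fsspec.filesystem("http", block_size=2 ** 20)      # HTTPFile with 1 MiB read-ahead

print(fs.ls("http://example.com/data/", detail=False))  # links scraped from the parent page

with fs.open("http://example.com/data/file.csv", "rb") as f:
    head = f.read(100)                                   # served via a Range request when supported

# block_size=0 would instead return the raw streaming requests object
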
0 import io
1 import os
2 import shutil
3 import posixpath
4 import re
5 import tempfile
6 from fsspec import AbstractFileSystem
7 from fsspec.utils import stringify_path
8
9
10 class LocalFileSystem(AbstractFileSystem):
11 """Interface to files on local storage
12
13 Parameters
14 ----------
15 auto_mkdir: bool
16 Whether, when opening a file, the directory containing it should
17 be created (if it doesn't already exist). This is assumed by pyarrow
18 code.
19 """
20
21 root_marker = "/"
22
23 def __init__(self, auto_mkdir=True, **kwargs):
24 super().__init__(**kwargs)
25 self.auto_mkdir = auto_mkdir
26
27 def mkdir(self, path, create_parents=True, **kwargs):
28 path = self._strip_protocol(path)
29 if create_parents:
30 self.makedirs(path, exist_ok=True)
31 else:
32 os.mkdir(path, **kwargs)
33
34 def makedirs(self, path, exist_ok=False):
35 path = self._strip_protocol(path)
36 os.makedirs(path, exist_ok=exist_ok)
37
38 def rmdir(self, path):
39 os.rmdir(path)
40
41 def ls(self, path, detail=False):
42 path = self._strip_protocol(path)
43 paths = [posixpath.join(path, f) for f in os.listdir(path)]
44 if detail:
45 return [self.info(f) for f in paths]
46 else:
47 return paths
48
49 def glob(self, path, **kargs):
50 path = self._strip_protocol(path)
51 return super().glob(path)
52
53 def info(self, path, **kwargs):
54 path = self._strip_protocol(path)
55 out = os.stat(path, follow_symlinks=False)
56 dest = False
57 if os.path.islink(path):
58 t = "link"
59 dest = os.readlink(path)
60 elif os.path.isdir(path):
61 t = "directory"
62 elif os.path.isfile(path):
63 t = "file"
64 else:
65 t = "other"
66 result = {"name": path, "size": out.st_size, "type": t, "created": out.st_ctime}
67 for field in ["mode", "uid", "gid", "mtime"]:
68 result[field] = getattr(out, "st_" + field)
69 if dest:
70 result["destination"] = dest
71 try:
72 out2 = os.stat(path, follow_symlinks=True)
73 result["size"] = out2.st_size
74 except IOError:
75 result["size"] = 0
76 return result
77
78 def copy(self, path1, path2, **kwargs):
79 shutil.copyfile(path1, path2)
80
81 def get(self, path1, path2, **kwargs):
82 if kwargs.get("recursive"):
83 return super(LocalFileSystem, self).get(path1, path2, **kwargs)
84 else:
85 return self.copy(path1, path2, **kwargs)
86
87 def put(self, path1, path2, **kwargs):
88 if kwargs.get("recursive"):
89 return super(LocalFileSystem, self).put(path1, path2, **kwargs)
90 else:
91 return self.copy(path1, path2, **kwargs)
92
93 def mv(self, path1, path2, **kwargs):
94 os.rename(path1, path2)
95
96 def rm(self, path, recursive=False, maxdepth=None):
97 if recursive and self.isdir(path):
98 shutil.rmtree(path)
99 else:
100 os.remove(path)
101
102 def _open(self, path, mode="rb", block_size=None, **kwargs):
103 path = self._strip_protocol(path)
104 if self.auto_mkdir:
105 self.makedirs(self._parent(path), exist_ok=True)
106 return LocalFileOpener(path, mode, fs=self, **kwargs)
107
108 def touch(self, path, **kwargs):
109 path = self._strip_protocol(path)
110 if self.exists(path):
111 os.utime(path, None)
112 else:
113 open(path, "a").close()
114
115 @classmethod
116 def _parent(cls, path):
117 path = cls._strip_protocol(path).rstrip("/")
118 if "/" in path:
119 return path.rsplit("/", 1)[0]
120 else:
121 return cls.root_marker
122
123 @classmethod
124 def _strip_protocol(cls, path):
125 path = stringify_path(path)
126 if path.startswith("file://"):
127 path = path[7:]
128 return make_path_posix(path)
129
130
131 def make_path_posix(path, sep=os.sep):
132 """ Make path generic """
133 if re.match("/[A-Za-z]:", path):
134 # for windows file URI like "file:///C:/folder/file"
135 # or "file:///C:\\dir\\file"
136 path = path[1:]
137 if path.startswith("\\\\"):
138 # special case for windows UNC/DFS-style paths: just flip the
139 # slashes around (the case below does not handle these correctly)
140 return path.replace("\\", "/")
141 if path.startswith("\\") or re.match("[\\\\]*[A-Za-z]:", path):
142 # windows full path "\\server\\path" or "C:\\local\\path"
143 return path.lstrip("\\").replace("\\", "/").replace("//", "/")
144 if (
145 sep not in path
146 and "/" not in path
147 or (sep == "/" and not path.startswith("/"))
148 or (sep == "\\" and ":" not in path)
149 ):
150 # relative path like "path" or "rel\\path" (win) or "rel/path"
151 path = os.path.abspath(path)
152 if os.sep == "\\":
153 # abspath made some more '\\' separators
154 return make_path_posix(path, sep)
155 return path
156
157
158 class LocalFileOpener(object):
159 def __init__(self, path, mode, autocommit=True, fs=None, **kwargs):
160 self.path = path
161 self.mode = mode
162 self.fs = fs
163 self.f = None
164 self.autocommit = autocommit
165 self.blocksize = io.DEFAULT_BUFFER_SIZE
166 self._open()
167
168 def _open(self):
169 if self.f is None or self.f.closed:
170 if self.autocommit or "w" not in self.mode:
171 self.f = open(self.path, mode=self.mode)
172 else:
173 # TODO: check if path is writable?
174 i, name = tempfile.mkstemp()
175 self.temp = name
176 self.f = open(name, mode=self.mode)
177 if "w" not in self.mode:
178 self.details = self.fs.info(self.path)
179 self.size = self.details["size"]
180 self.f.size = self.size
181
182 def _fetch_range(self, start, end):
183 # probably only used by cached FS
184 if "r" not in self.mode:
185 raise ValueError
186 self._open()
187 self.f.seek(start)
188 return self.f.read(end - start)
189
190 def __setstate__(self, state):
191 self.f = None
192 loc = state.pop("loc", None)
193 self.__dict__.update(state)
194 if "r" in state["mode"]:
195 # reopen the file and restore the previous read position
196 self._open()
197 self.f.seek(loc)
198
199 def __getstate__(self):
200 d = self.__dict__.copy()
201 d.pop("f")
202 if "r" in self.mode:
203 d["loc"] = self.f.tell()
204 else:
205 if not self.f.closed:
206 raise ValueError("Cannot serialise open write-mode local file")
207 return d
208
209 def commit(self):
210 if self.autocommit:
211 raise RuntimeError("Can only commit if not already set to autocommit")
212 os.rename(self.temp, self.path)
213
214 def discard(self):
215 if self.autocommit:
216 raise RuntimeError("Cannot discard if set to autocommit")
217 os.remove(self.temp)
218
219 def __fspath__(self):
220 # uniquely for fsspec implementations, this is a real path
221 return self.path
222
223 def __getattr__(self, item):
224 return getattr(self.f, item)
225
226 def __enter__(self):
227 self._incontext = True
228 return self.f.__enter__()
229
230 def __exit__(self, exc_type, exc_value, traceback):
231 self._incontext = False
232 self.f.__exit__(exc_type, exc_value, traceback)
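
A few illustrative conversions through ``make_path_posix`` and a small round-trip with ``LocalFileSystem``; the Windows-style inputs are examples only, and relative inputs would be resolved against the current working directory:

import os
import tempfile
from fsspec.implementations.local import LocalFileSystem, make_path_posix

# examples of the normalisation performed above
assert make_path_posix("/C:/folder/file") == "C:/folder/file"
assert make_path_posix("\\\\server\\share\\file") == "//server/share/file"

fs = LocalFileSystem(auto_mkdir=True)
target = os.path.join(tempfile.mkdtemp(), "sub", "afile")
with fs.open(target, "wb") as f:     # "sub" is created automatically because auto_mkdir=True
    f.write(b"hello")
assert fs.cat(target) == b"hello"
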
0 from __future__ import print_function, division, absolute_import
1
2 from io import BytesIO
3 from fsspec import AbstractFileSystem
4 import logging
5
6 logger = logging.getLogger("fsspec.memoryfs")
7
8
9 class MemoryFileSystem(AbstractFileSystem):
10 """A filesystem based on a dict of BytesIO objects"""
11
12 store = {} # global
13 pseudo_dirs = []
14 protocol = "memory"
15 root_marker = ""
16
17 def ls(self, path, detail=False):
18 if path in self.store:
19 # there is a key with this exact name; it could also be a directory
20 out = [
21 {
22 "name": path,
23 "size": self.store[path].getbuffer().nbytes,
24 "type": "file",
25 }
26 ]
27 else:
28 out = []
29 path = path.strip("/")
30 paths = set()
31 for p2 in self.store:
32 has_slash = "/" if p2.startswith("/") else ""
33 p = p2.lstrip("/")
34 if "/" in p:
35 root = p.rsplit("/", 1)[0]
36 else:
37 root = ""
38 if root == path:
39 out.append(
40 {
41 "name": has_slash + p,
42 "size": self.store[p2].getbuffer().nbytes,
43 "type": "file",
44 }
45 )
46 elif path and all(
47 (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/"))
48 ):
49 # implicit directory
50 ppath = "/".join(p.split("/")[: len(path.split("/")) + 1])
51 if ppath not in paths:
52 out.append(
53 {
54 "name": has_slash + ppath + "/",
55 "size": 0,
56 "type": "directory",
57 }
58 )
59 paths.add(ppath)
60 elif all(
61 (a == b)
62 for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
63 ):
64 # root directory entry
65 ppath = p.rstrip("/").split("/", 1)[0]
66 if ppath not in paths:
67 out.append(
68 {
69 "name": has_slash + ppath + "/",
70 "size": 0,
71 "type": "directory",
72 }
73 )
74 paths.add(ppath)
75 for p2 in self.pseudo_dirs:
76 if self._parent(p2).strip("/").rstrip("/") == path:
77 out.append({"name": p2 + "/", "size": 0, "type": "directory"})
78 if detail:
79 return out
80 return sorted([f["name"] for f in out])
81
82 def mkdir(self, path):
83 path = path.rstrip("/")
84 if path not in self.pseudo_dirs:
85 self.pseudo_dirs.append(path)
86
87 def rmdir(self, path):
88 path = path.rstrip("/")
89 if path in self.pseudo_dirs:
90 if self.ls(path) == []:
91 self.pseudo_dirs.remove(path)
92 else:
93 raise OSError("Directory %s not empty" % path)
94 else:
95 raise FileNotFoundError(path)
96
97 def exists(self, path):
98 return path in self.store
99
100 def _open(self, path, mode="rb", **kwargs):
101 """Make a file-like object
102
103 Parameters
104 ----------
105 path: str
106 identifier
107 mode: str
108 normally "rb", "wb" or "ab"
109 """
110 if mode in ["rb", "ab", "rb+"]:
111 if path in self.store:
112 f = self.store[path]
113 if mode == "rb":
114 f.seek(0)
115 else:
116 f.seek(0, 2)
117 return f
118 else:
119 raise FileNotFoundError(path)
120 if mode == "wb":
121 m = MemoryFile(self, path)
122 if not self._intrans:
123 m.commit()
124 return m
125
126 def copy(self, path1, path2, **kwargs):
127 self.store[path2] = MemoryFile(self, path2, self.store[path1].getbuffer())
128
129 def cat(self, path):
130 return self.store[path].getvalue()
131
132 def _rm(self, path):
133 del self.store[path]
134
135 def size(self, path):
136 """Size in bytes of the file at path"""
137 if path not in self.store:
138 raise FileNotFoundError(path)
139 return self.store[path].getbuffer().nbytes
140
141
142 class MemoryFile(BytesIO):
143 """A BytesIO which can't close and works as a context manager
144
145 Can initialise with data
146
147 No need to provide fs, path if auto-committing (default)
148 """
149
150 def __init__(self, fs, path, data=None):
151 self.fs = fs
152 self.path = path
153 if data:
154 self.write(data)
155 self.size = len(data)
156 self.seek(0)
157
158 def __enter__(self):
159 return self
160
161 def close(self):
162 self.size = self.seek(0, 2)
163
164 def discard(self):
165 pass
166
167 def commit(self):
168 self.fs.store[self.path] = self
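
A compact illustration of the in-memory filesystem above; note that ``store`` is a class-level dict, so every instance of the "memory" protocol in a process shares the same contents:

import fsspec

m = fsspec.filesystem("memory")
with m.open("afile", "wb") as f:       # a MemoryFile, committed to the shared store on close
    f.write(b"data")

assert m.cat("afile") == b"data"
assert "afile" in m.ls("", detail=False)
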
0 import paramiko
1 from stat import S_ISDIR, S_ISLNK
2 import types
3 import uuid
4 from .. import AbstractFileSystem
5 from ..utils import infer_storage_options
6
7
8 class SFTPFileSystem(AbstractFileSystem):
9 """Files over SFTP/SSH
10
11 Peer-to-peer filesystem over SSH using paramiko.
12 """
13
14 protocol = "sftp", "ssh"
15
16 def __init__(self, host, **ssh_kwargs):
17 """
18
19 Parameters
20 ----------
21 host: str
22 Hostname or IP as a string
23 temppath: str
24 Location on the server to put files, when within a transaction
25 ssh_kwargs: dict
26 Parameters passed on to connection. See details in
27 http://docs.paramiko.org/en/2.4/api/client.html#paramiko.client.SSHClient.connect
28 May include port, username, password...
29 """
30 if self._cached:
31 return
32 super(SFTPFileSystem, self).__init__(**ssh_kwargs)
33 self.temppath = ssh_kwargs.pop("temppath", "/tmp")
34 self.host = host
35 self.ssh_kwargs = ssh_kwargs
36 self._connect()
37
38 def _connect(self):
39 self.client = paramiko.SSHClient()
40 self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
41 self.client.connect(self.host, **self.ssh_kwargs)
42 self.ftp = self.client.open_sftp()
43
44 @classmethod
45 def _strip_protocol(cls, path):
46 return infer_storage_options(path)["path"]
47
48 @staticmethod
49 def _get_kwargs_from_urls(urlpath):
50 out = infer_storage_options(urlpath)
51 out.pop("path", None)
52 out.pop("protocol", None)
53 return out
54
55 def mkdir(self, path, mode=511):
56 self.ftp.mkdir(path, mode)
57
58 def makedirs(self, path, exist_ok=False, mode=511):
59 if self.exists(path) and not exist_ok:
60 raise FileExistsError("File exists: {}".format(path))
61
62 parts = path.split("/")
63 path = ""
64
65 for part in parts:
66 path += "/" + part
67 if not self.exists(path):
68 self.mkdir(path, mode)
69
70 def rmdir(self, path):
71 self.ftp.rmdir(path)
72
73 def info(self, path):
74 s = self.ftp.stat(path)
75 if S_ISDIR(s.st_mode):
76 t = "directory"
77 elif S_ISLNK(s.st_mode):
78 t = "link"
79 else:
80 t = "file"
81 return {
82 "name": path + "/" if t == "directory" else path,
83 "size": s.st_size,
84 "type": t,
85 "uid": s.st_uid,
86 "gid": s.st_gid,
87 "time": s.st_atime,
88 "mtime": s.st_mtime,
89 }
90
91 def ls(self, path, detail=False):
92 out = ["/".join([path.rstrip("/"), p]) for p in self.ftp.listdir(path)]
93 out = [self.info(o) for o in out]
94 if detail:
95 return out
96 return sorted([p["name"] for p in out])
97
98 def put(self, lpath, rpath):
99 self.ftp.put(lpath, rpath)
100
101 def get(self, rpath, lpath):
102 self.ftp.get(rpath, lpath)
103
104 def _open(self, path, mode="rb", block_size=None, **kwargs):
105 """
106 block_size: int or None
107 If 0, no buffering, if 1, line buffering, if >1, buffer that many
108 bytes, if None use default from paramiko.
109 """
110 if kwargs.get("autocommit", True) is False:
111 # writes to temporary file, move on commit
112 path2 = "{}/{}".format(self.temppath, uuid.uuid4())
113 f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
114 f.temppath = path2
115 f.targetpath = path
116 f.fs = self
117 f.commit = types.MethodType(commit_a_file, f)
118 f.discard = types.MethodType(discard_a_file, f)
119 else:
120 f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
121 return f
122
123 def _rm(self, path):
124 if self.isdir(path):
125 self.ftp.rmdir(path)
126 else:
127 self.ftp.remove(path)
128
129 def mv(self, old, new):
130 self.ftp.posix_rename(old, new)
131
132
133 def commit_a_file(self):
134 self.fs.mv(self.temppath, self.targetpath)
135
136
137 def discard_a_file(self):
138 self.fs._rm(self.temppath)
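
A connection sketch for the SFTP implementation above; the host and credentials are placeholders, and any keyword arguments are passed straight through to paramiko's ``SSHClient.connect``:

from fsspec.implementations.sftp import SFTPFileSystem

# placeholder connection details
fs = SFTPFileSystem("sftp.example.com", port=22, username="user", password="secret")

print(fs.ls("/home/user", detail=False))
with fs.open("/home/user/notes.txt", "rb") as f:   # a paramiko SFTPFile under the hood
    data = f.read()
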
0 import os
1 import shutil
2 import pickle
3 import pytest
4
5 import fsspec
6 from fsspec.implementations.cached import CachingFileSystem
7 from .test_ftp import FTPFileSystem
8
9
10 @pytest.fixture
11 def local_filecache():
12 import tempfile
13
14 original_location = tempfile.mkdtemp()
15 cache_location = tempfile.mkdtemp()
16 original_file = os.path.join(original_location, "afile")
17 data = b"test data"
18 with open(original_file, "wb") as f:
19 f.write(data)
20
21 # we can access the file and read it
22 fs = fsspec.filesystem(
23 "filecache", target_protocol="file", cache_storage=cache_location
24 )
25
26 return (data, original_file, cache_location, fs)
27
28
29 def test_idempotent():
30 fs = CachingFileSystem("file")
31 fs2 = CachingFileSystem("file")
32 assert fs2 is fs
33 fs3 = pickle.loads(pickle.dumps(fs))
34 assert fs3.storage == fs.storage
35
36
37 def test_workflow(ftp_writable):
38 host, port, user, pw = ftp_writable
39 fs = FTPFileSystem(host, port, user, pw)
40 with fs.open("/out", "wb") as f:
41 f.write(b"test")
42 fs = fsspec.filesystem(
43 "cached",
44 target_protocol="ftp",
45 target_options={"host": host, "port": port, "username": user, "password": pw},
46 )
47 assert os.listdir(fs.storage[-1]) == []
48 with fs.open("/out") as f:
49 assert os.listdir(fs.storage[-1])
50 assert f.read() == b"test"
51 assert fs.cached_files[-1]["ftp:///out"]["blocks"]
52 assert fs.cat("/out") == b"test"
53 assert fs.cached_files[-1]["ftp:///out"]["blocks"] is True
54
55 with fs.open("/out", "wb") as f:
56 f.write(b"changed")
57
58 assert fs.cat("/out") == b"test" # old value
59
60
61 def test_blocksize(ftp_writable):
62 host, port, user, pw = ftp_writable
63 fs = FTPFileSystem(host, port, user, pw)
64 with fs.open("/out_block", "wb") as f:
65 f.write(b"test" * 4000)
66
67 fs = fsspec.filesystem(
68 "blockcache",
69 target_protocol="ftp",
70 target_options={"host": host, "port": port, "username": user, "password": pw},
71 )
72
73 with fs.open("/out_block", block_size=20) as f:
74 assert f.read(1) == b"t"
75 with pytest.raises(ValueError):
76 fs.open("/out_block", block_size=30)
77
78
79 def test_local_filecache_creates_dir_if_needed():
80 import tempfile
81
82 original_location = tempfile.mkdtemp()
83 cache_location = "foofoobarbar"
84 assert not os.path.exists(cache_location)
85
86 try:
87 original_file = os.path.join(original_location, "afile")
88 data = b"test data"
89 with open(original_file, "wb") as f:
90 f.write(data)
91
92 # we can access the file and read it
93 fs = fsspec.filesystem(
94 "filecache", target_protocol="file", cache_storage=cache_location
95 )
96
97 with fs.open(original_file, "rb") as f:
98 data_in_cache = f.read()
99
100 assert os.path.exists(cache_location)
101
102 finally:
103 shutil.rmtree(cache_location)
104
105 assert data_in_cache == data
106
107
108 def test_local_filecache_basic(local_filecache):
109 data, original_file, cache_location, fs = local_filecache
110
111 # reading from the file contains the right data
112 with fs.open(original_file, "rb") as f:
113 assert f.read() == data
114 assert "cache" in os.listdir(cache_location)
115
116 # the file in the location contains the right data
117 fn = list(fs.cached_files[-1].values())[0]["fn"] # this is a hash value
118 assert fn in os.listdir(cache_location)
119 with open(os.path.join(cache_location, fn), "rb") as f:
120 assert f.read() == data
121
122 # still there when original file is removed (check=False)
123 os.remove(original_file)
124 with fs.open(original_file, "rb") as f:
125 assert f.read() == data
126
127
128 def test_local_filecache_does_not_change_when_original_data_changed(local_filecache):
129 old_data, original_file, cache_location, fs = local_filecache
130 new_data = b"abc"
131
132 with fs.open(original_file, "rb") as f:
133 assert f.read() == old_data
134
135 with open(original_file, "wb") as f:
136 f.write(new_data)
137
138 with fs.open(original_file, "rb") as f:
139 assert f.read() == old_data
140
141
142 def test_local_filecache_gets_from_original_if_cache_deleted(local_filecache):
143 old_data, original_file, cache_location, fs = local_filecache
144 new_data = b"abc"
145
146 with fs.open(original_file, "rb") as f:
147 assert f.read() == old_data
148
149 with open(original_file, "wb") as f:
150 f.write(new_data)
151
152 shutil.rmtree(cache_location)
153 assert os.path.exists(original_file)
154
155 with open(original_file, "rb") as f:
156 assert f.read() == new_data
157
158 with fs.open(original_file, "rb") as f:
159 assert f.read() == new_data
160
161 # the file in the location contains the right data
162 fn = list(fs.cached_files[-1].values())[0]["fn"] # this is a hash value
163 assert fn in os.listdir(cache_location)
164 with open(os.path.join(cache_location, fn), "rb") as f:
165 assert f.read() == new_data
166
167
168 def test_local_filecache_with_new_cache_location_makes_a_new_copy(local_filecache):
169 import tempfile
170
171 data, original_file, old_cache_location, old_fs = local_filecache
172 new_cache_location = tempfile.mkdtemp()
173
174 with old_fs.open(original_file, "rb") as f:
175 assert f.read() == data
176
177 new_fs = fsspec.filesystem(
178 "filecache", target_protocol="file", cache_storage=new_cache_location
179 )
180
181 with new_fs.open(original_file, "rb") as f:
182 assert f.read() == data
183
184 # the file in the location contains the right data
185 fn = list(new_fs.cached_files[-1].values())[0]["fn"] # this is a hash value
186 assert fn in os.listdir(old_cache_location)
187 assert fn in os.listdir(new_cache_location)
188
189 with open(os.path.join(new_cache_location, fn), "rb") as f:
190 assert f.read() == data
191
192
193 def test_filecache_multicache():
194 import tempfile
195
196 origin = tempfile.mkdtemp()
197 cache1 = tempfile.mkdtemp()
198 cache2 = tempfile.mkdtemp()
199 data = b"test data"
200 f1 = os.path.join(origin, "afile")
201 f2 = os.path.join(origin, "bfile")
202 with open(f1, "wb") as f:
203 f.write(data)
204 with open(f2, "wb") as f:
205 f.write(data * 2)
206
207 # populates first cache
208 fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1)
209 assert fs.cat(f1) == data
210
211 assert len(os.listdir(cache1)) == 2 # cache and hashed afile
212 assert len(os.listdir(cache2)) == 0 # hasn't been initialized yet
213
214 # populates last cache if file not found in first cache
215 fs = fsspec.filesystem(
216 "filecache", target_protocol="file", cache_storage=[cache1, cache2]
217 )
218
219 assert fs.cat(f1) == data
220 assert fs.cat(f2) == data * 2
221
222 assert "cache" in os.listdir(cache1)
223 assert "cache" in os.listdir(cache2)
224
225 cache1_contents = [f for f in os.listdir(cache1) if f != "cache"]
226 assert len(cache1_contents) == 1
227
228 with open(os.path.join(cache1, cache1_contents[0]), "rb") as f:
229 assert f.read() == data
230
231 cache2_contents = [f for f in os.listdir(cache2) if f != "cache"]
232 assert len(cache2_contents) == 1
233
234 with open(os.path.join(cache2, cache2_contents[0]), "rb") as f:
235 assert f.read() == data * 2
236
237
238 def test_filecache_multicache_with_same_file_different_data_reads_from_first():
239 import tempfile
240
241 origin = tempfile.mkdtemp()
242 cache1 = tempfile.mkdtemp()
243 cache2 = tempfile.mkdtemp()
244 data = b"test data"
245 f1 = os.path.join(origin, "afile")
246 with open(f1, "wb") as f:
247 f.write(data)
248
249 # populate first cache
250 fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1)
251 assert fs.cat(f1) == data
252
253 with open(f1, "wb") as f:
254 f.write(data * 2)
255
256 # populate second cache
257 fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache2)
258
259 assert fs.cat(f1) == data * 2
260
261 # the filenames in each cache are the same, but the data is different
262 assert os.listdir(cache1) == os.listdir(cache2)
263
264 fs = fsspec.filesystem(
265 "filecache", target_protocol="file", cache_storage=[cache1, cache2]
266 )
267
268 assert fs.cat(f1) == data
269
270
271 def test_filecache_with_checks():
272 import tempfile
273 import time
274
275 origin = tempfile.mkdtemp()
276 cache1 = tempfile.mkdtemp()
277 data = b"test data"
278 f1 = os.path.join(origin, "afile")
279 with open(f1, "wb") as f:
280 f.write(data)
281
282 # populate first cache
283 fs = fsspec.filesystem(
284 "filecache", target_protocol="file", cache_storage=cache1, expiry_time=0.1
285 )
286 fs2 = fsspec.filesystem(
287 "filecache", target_protocol="file", cache_storage=cache1, check_files=True
288 )
289 assert fs.cat(f1) == data
290 assert fs2.cat(f1) == data
291
292 with open(f1, "wb") as f:
293 f.write(data * 2)
294
295 assert fs.cat(f1) == data # does not change
296 assert fs2.cat(f1) == data * 2 # changed, since origin changed
297 time.sleep(0.11) # allow cache details to expire
298 assert fs.cat(f1) == data * 2 # changed, since origin changed
299
300
301 def test_takes_fs_instance():
302 import tempfile
303
304 origin = tempfile.mkdtemp()
305 data = b"test data"
306 f1 = os.path.join(origin, "afile")
307 with open(f1, "wb") as f:
308 f.write(data)
309
310 fs = fsspec.filesystem("file")
311 fs2 = fsspec.filesystem("filecache", target_protocol=fs)
312
313 assert fs2.cat(f1) == data
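
The fixtures and tests above exercise the caching layers end to end; for reference, the same "filecache" pattern in standalone form (temporary directories, everything local) looks roughly like this:

import os
import tempfile

import fsspec

origin = tempfile.mkdtemp()
cache = tempfile.mkdtemp()
with open(os.path.join(origin, "afile"), "wb") as f:
    f.write(b"test data")

# repeated reads of the same path are served from the local copy in `cache`
fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache)
assert fs.cat(os.path.join(origin, "afile")) == b"test data"
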
0 import pytest
1 import fsspec
2
3 pytest.importorskip("distributed")
4
5
6 @pytest.fixture()
7 def cli(tmpdir):
8 import dask.distributed
9
10 client = dask.distributed.Client(n_workers=1)
11
12 def setup():
13 m = fsspec.filesystem("memory")
14 with m.open("afile", "wb") as f:
15 f.write(b"data")
16
17 client.run(setup)
18 try:
19 yield client
20 finally:
21 client.close()
22
23
24 def test_basic(cli):
25
26 fs = fsspec.filesystem("dask", remote_protocol="memory")
27 assert fs.ls("") == ["afile"]
28 assert fs.cat("afile") == b"data"
0 import os
1 import pytest
2 import subprocess
3 import sys
4 import time
5
6 from fsspec.implementations.ftp import FTPFileSystem
7 from fsspec import open_files
8 import fsspec
9
10 here = os.path.dirname(os.path.abspath(__file__))
11
12
13 @pytest.fixture()
14 def ftp():
15 P = subprocess.Popen(
16 [sys.executable, "-m", "pyftpdlib", "-d", here],
17 stderr=subprocess.STDOUT,
18 stdout=subprocess.PIPE,
19 )
20 try:
21 time.sleep(1)
22 yield "localhost", 2121
23 finally:
24 P.terminate()
25 P.wait()
26
27
28 def test_basic(ftp):
29 host, port = ftp
30 fs = FTPFileSystem(host, port)
31 assert fs.ls("/", detail=False) == sorted(os.listdir(here))
32 out = fs.cat("/" + os.path.basename(__file__))
33 assert out == open(__file__, "rb").read()
34
35
36 def test_not_cached(ftp):
37 host, port = ftp
38 fs = FTPFileSystem(host, port)
39 fs2 = FTPFileSystem(host, port)
40 assert fs is not fs2
41
42
43 @pytest.mark.parametrize("cache_type", ["bytes", "mmap"])
44 def test_complex(ftp_writable, cache_type):
45 from fsspec.core import BytesCache
46
47 host, port, user, pw = ftp_writable
48 files = open_files(
49 "ftp:///ou*",
50 host=host,
51 port=port,
52 username=user,
53 password=pw,
54 block_size=10000,
55 cache_type=cache_type,
56 )
57 assert len(files) == 1
58 with files[0] as fo:
59 assert fo.read(10) == b"hellohello"
60 if isinstance(fo.cache, BytesCache):
61 assert len(fo.cache.cache) == 10010
62 assert fo.read(2) == b"he"
63 assert fo.tell() == 12
64
65
66 def test_write_small(ftp_writable):
67 host, port, user, pw = ftp_writable
68 fs = FTPFileSystem(host, port, user, pw)
69 with fs.open("/out2", "wb") as f:
70 f.write(b"oi")
71 assert fs.cat("/out2") == b"oi"
72
73
74 def test_with_url(ftp_writable):
75 host, port, user, pw = ftp_writable
76 fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "wb")
77 with fo as f:
78 f.write(b"hello")
79 fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "rb")
80 with fo as f:
81 assert f.read() == b"hello"
82
83
84 @pytest.mark.parametrize("cache_type", ["bytes", "mmap"])
85 def test_write_big(ftp_writable, cache_type):
86 host, port, user, pw = ftp_writable
87 fs = FTPFileSystem(host, port, user, pw, block_size=1000, cache_type=cache_type)
88 fn = "/bigger"
89 with fs.open(fn, "wb") as f:
90 f.write(b"o" * 500)
91 assert not fs.exists(fn)
92 f.write(b"o" * 1000)
93 fs.invalidate_cache()
94 assert fs.exists(fn)
95 f.write(b"o" * 200)
96 f.flush()
97
98 assert fs.info(fn)["size"] == 1700
99 assert fs.cat(fn) == b"o" * 1700
100
101
102 def test_transaction(ftp_writable):
103 host, port, user, pw = ftp_writable
104 fs = FTPFileSystem(host, port, user, pw)
105 fs.mkdir("/tmp")
106 fn = "/tr"
107 with fs.transaction:
108 with fs.open(fn, "wb") as f:
109 f.write(b"not")
110 assert not fs.exists(fn)
111 assert fs.exists(fn)
112 assert fs.cat(fn) == b"not"
113
114 fs.rm(fn)
115 assert not fs.exists(fn)
0 import pytest
1 from http.server import BaseHTTPRequestHandler, HTTPServer
2 import threading
3 import fsspec
4
5 requests = pytest.importorskip("requests")
6 port = 9898
7 data = b"\n".join([b"some test data"] * 1000)
8 realfile = "http://localhost:%i/index/realfile" % port
9 index = b'<a href="%s">Link</a>' % realfile.encode()
10
11
12 class HTTPTestHandler(BaseHTTPRequestHandler):
13 def _respond(self, code=200, headers=None, data=b""):
14 headers = headers or {}
15 headers.update({"User-Agent": "test"})
16 self.send_response(code)
17 for k, v in headers.items():
18 self.send_header(k, str(v))
19 self.end_headers()
20 if data:
21 self.wfile.write(data)
22
23 def do_GET(self):
24 if self.path not in ["/index/realfile", "/index"]:
25 self._respond(404)
26 return
27
28 d = data if self.path == "/index/realfile" else index
29 if "Range" in self.headers:
30 ran = self.headers["Range"]
31 b, ran = ran.split("=")
32 start, end = ran.split("-")
33 print(start)
34 print(end)
35 d = d[int(start) : int(end) + 1]
36 if "give_length" in self.headers:
37 response_headers = {"Content-Length": len(d)}
38 self._respond(200, response_headers, d)
39 elif "give_range" in self.headers:
40 self._respond(200, {"Content-Range": "0-%i/%i" % (len(d) - 1, len(d))}, d)
41 else:
42 self._respond(200, data=d)
43
44 def do_HEAD(self):
45 if "head_ok" not in self.headers:
46 self._respond(405)
47 return
48 d = data if self.path == "/index/realfile" else index
49 if self.path not in ["/index/realfile", "/index"]:
50 self._respond(404)
51 elif "give_length" in self.headers:
52 response_headers = {"Content-Length": len(d)}
53 if "zero_length" in self.headers:
54 response_headers["Content-Length"] = 0
55
56 self._respond(200, response_headers)
57 elif "give_range" in self.headers:
58 self._respond(200, {"Content-Range": "0-%i/%i" % (len(d) - 1, len(d))})
59 else:
60 self._respond(200) # OK response, but no useful info
61
62
63 @pytest.fixture(scope="module")
64 def server():
65 server_address = ("", port)
66 httpd = HTTPServer(server_address, HTTPTestHandler)
67 th = threading.Thread(target=httpd.serve_forever)
68 th.daemon = True
69 th.start()
70 try:
71 yield "http://localhost:%i" % port
72 finally:
73 httpd.socket.close()
74 httpd.shutdown()
75 th.join()
76
77
78 def test_list(server):
79 h = fsspec.filesystem("http")
80 out = h.glob(server + "/index/*")
81 assert out == [server + "/index/realfile"]
82
83
84 def test_policy_arg(server):
85 h = fsspec.filesystem("http", size_policy="get")
86 out = h.glob(server + "/index/*")
87 assert out == [server + "/index/realfile"]
88
89
90 def test_exists(server):
91 h = fsspec.filesystem("http")
92 assert not h.exists(server + "/notafile")
93
94
95 def test_read(server):
96 h = fsspec.filesystem("http")
97 out = server + "/index/realfile"
98 with h.open(out, "rb") as f:
99 assert f.read() == data
100 with h.open(out, "rb", block_size=0) as f:
101 assert f.read() == data
102 with h.open(out, "rb") as f:
103 assert f.read(100) + f.read() == data
104
105
106 def test_methods(server):
107 h = fsspec.filesystem("http")
108 url = server + "/index/realfile"
109 assert h.exists(url)
110 assert h.cat(url) == data
111
112
113 @pytest.mark.parametrize(
114 "headers",
115 [
116 {},
117 {"give_length": "true"},
118 {"give_length": "true", "head_ok": "true"},
119 {"give_range": "true"},
120 ],
121 )
122 def test_random_access(server, headers):
123 h = fsspec.filesystem("http", headers=headers)
124 url = server + "/index/realfile"
125 with h.open(url, "rb") as f:
126 if headers:
127 assert f.size == len(data)
128 assert f.read(5) == data[:5]
129 # python server does not respect bytes range request
130 # we actually get all the data
131 f.seek(5, 1)
132 assert f.read(5) == data[10:15]
133
134
135 def test_mapper_url(server):
136 h = fsspec.filesystem("http")
137 mapper = h.get_mapper(server + "/index/")
138 assert mapper.root.startswith("http:")
139 assert list(mapper)
140
141 mapper2 = fsspec.get_mapper(server + "/index/")
142 assert mapper2.root.startswith("http:")
143 assert list(mapper) == list(mapper2)
144
145
146 def test_content_length_zero(server):
147 h = fsspec.filesystem(
148 "http", headers={"give_length": "true", "zero_length": "true"}
149 )
150 url = server + "/index/realfile"
151
152 with h.open(url, "rb") as f:
153 assert f.read() == data
0 from __future__ import print_function, division, absolute_import
1
2 import gzip
3 import os
4 import os.path
5 import sys
6 from contextlib import contextmanager
7 import tempfile
8
9 import pytest
10 import fsspec
11 from fsspec.core import open_files, get_fs_token_paths, OpenFile
12 from fsspec.implementations.local import LocalFileSystem, make_path_posix
13 from fsspec import compression
14
15 files = {
16 ".test.accounts.1.json": (
17 b'{"amount": 100, "name": "Alice"}\n'
18 b'{"amount": 200, "name": "Bob"}\n'
19 b'{"amount": 300, "name": "Charlie"}\n'
20 b'{"amount": 400, "name": "Dennis"}\n'
21 ),
22 ".test.accounts.2.json": (
23 b'{"amount": 500, "name": "Alice"}\n'
24 b'{"amount": 600, "name": "Bob"}\n'
25 b'{"amount": 700, "name": "Charlie"}\n'
26 b'{"amount": 800, "name": "Dennis"}\n'
27 ),
28 }
29
30
31 csv_files = {
32 ".test.fakedata.1.csv": (b"a,b\n" b"1,2\n"),
33 ".test.fakedata.2.csv": (b"a,b\n" b"3,4\n"),
34 }
35
36
37 @contextmanager
38 def filetexts(d, open=open, mode="t"):
39 """ Dumps a number of textfiles to disk
40
41 d - dict
42 a mapping from filename to text like {'a.csv': '1,1\n2,2'}
43
44 Since this is meant for use in tests, this context manager will
45 automatically switch to a temporary current directory, to avoid
46 race conditions when running tests in parallel.
47 """
48 odir = os.getcwd()
49 dirname = tempfile.mkdtemp()
50 try:
51 os.chdir(dirname)
52 for filename, text in d.items():
53 f = open(filename, "w" + mode)
54 try:
55 f.write(text)
56 finally:
57 try:
58 f.close()
59 except AttributeError:
60 pass
61
62 yield list(d)
63
64 for filename in d:
65 if os.path.exists(filename):
66 try:
67 os.remove(filename)
68 except (IOError, OSError):
69 pass
70 finally:
71 os.chdir(odir)
72
73
74 def test_urlpath_inference_strips_protocol(tmpdir):
75 tmpdir = str(tmpdir)
76 paths = [os.path.join(tmpdir, "test.%02d.csv" % i) for i in range(20)]
77
78 for path in paths:
79 with open(path, "wb") as f:
80 f.write(b"1,2,3\n" * 10)
81
82 # globstring
83 protocol = "file:///" if sys.platform == "win32" else "file://"
84 urlpath = protocol + os.path.join(tmpdir, "test.*.csv")
85 _, _, paths2 = get_fs_token_paths(urlpath)
86 assert paths2 == paths
87
88 # list of paths
89 _, _, paths2 = get_fs_token_paths([protocol + p for p in paths])
90 assert paths2 == paths
91
92
93 def test_urlpath_inference_errors():
94 # Empty list
95 with pytest.raises(ValueError) as err:
96 get_fs_token_paths([])
97 assert "empty" in str(err.value)
98
99 # Protocols differ
100 with pytest.raises(ValueError) as err:
101 get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"])
102 assert "same protocol" in str(err.value)
103
104 # Unknown type
105 with pytest.raises(TypeError):
106 get_fs_token_paths(
107 {"sets/are.csv", "unordered/so/they.csv", "should/not/be.csvallowed.csv"}
108 )
109
110
111 def test_urlpath_expand_read():
112 """Make sure * is expanded in file paths when reading."""
113 # when reading, globs should be expanded to read files by mask
114 with filetexts(csv_files, mode="b"):
115 _, _, paths = get_fs_token_paths("./.*.csv")
116 assert len(paths) == 2
117 _, _, paths = get_fs_token_paths(["./.*.csv"])
118 assert len(paths) == 2
119
120
121 def test_urlpath_expand_write():
122 """Make sure * is expanded in file paths when writing."""
123 _, _, paths = get_fs_token_paths("prefix-*.csv", mode="wb", num=2)
124 assert all(
125 [p.endswith(pa) for p, pa in zip(paths, ["/prefix-0.csv", "/prefix-1.csv"])]
126 )
127 _, _, paths = get_fs_token_paths(["prefix-*.csv"], mode="wb", num=2)
128 assert all(
129 [p.endswith(pa) for p, pa in zip(paths, ["/prefix-0.csv", "/prefix-1.csv"])]
130 )
131 # we can read with multiple masks, but not write
132 with pytest.raises(ValueError):
133 _, _, paths = get_fs_token_paths(
134 ["prefix1-*.csv", "prefix2-*.csv"], mode="wb", num=2
135 )
136
137
138 def test_open_files():
139 with filetexts(files, mode="b"):
140 myfiles = open_files("./.test.accounts.*")
141 assert len(myfiles) == len(files)
142 for lazy_file, data_file in zip(myfiles, sorted(files)):
143 with lazy_file as f:
144 x = f.read()
145 assert x == files[data_file]
146
147
148 @pytest.mark.parametrize("encoding", ["utf-8", "ascii"])
149 def test_open_files_text_mode(encoding):
150 with filetexts(files, mode="b"):
151 myfiles = open_files("./.test.accounts.*", mode="rt", encoding=encoding)
152 assert len(myfiles) == len(files)
153 data = []
154 for file in myfiles:
155 with file as f:
156 data.append(f.read())
157 assert list(data) == [files[k].decode(encoding) for k in sorted(files)]
158
159
160 @pytest.mark.parametrize("mode", ["rt", "rb"])
161 @pytest.mark.parametrize("fmt", list(compression.compr))
162 def test_compressions(fmt, mode, tmpdir):
163 if fmt == "zip" and sys.version_info < (3, 6):
164 pytest.xfail("zip compression requires python3.6 or higher")
165
166 tmpdir = str(tmpdir)
167 fn = os.path.join(tmpdir, ".tmp.getsize")
168 fs = LocalFileSystem()
169 f = OpenFile(fs, fn, compression=fmt, mode="wb")
170 data = b"Long line of readily compressible text"
171 with f as fo:
172 fo.write(data)
173 if fmt is None:
174 assert fs.size(fn) == len(data)
175 else:
176 assert fs.size(fn) != len(data)
177
178 f = OpenFile(fs, fn, compression=fmt, mode=mode)
179 with f as fo:
180 if mode == "rb":
181 assert fo.read() == data
182 else:
183 assert fo.read() == data.decode()
184
185
186 def test_bad_compression():
187 with filetexts(files, mode="b"):
188 for func in [open_files]:
189 with pytest.raises(ValueError):
190 func("./.test.accounts.*", compression="not-found")
191
192
193 def test_not_found():
194 fn = "not-a-file"
195 fs = LocalFileSystem()
196 with pytest.raises((FileNotFoundError, OSError)):
197 with OpenFile(fs, fn, mode="rb"):
198 pass
199
200
201 def test_isfile():
202 fs = LocalFileSystem()
203 with filetexts(files, mode="b"):
204 for f in files.keys():
205 assert fs.isfile(f)
206 assert not fs.isfile("not-a-file")
207
208
209 def test_isdir():
210 fs = LocalFileSystem()
211 with filetexts(files, mode="b"):
212 for f in files.keys():
213 assert fs.isdir(os.path.dirname(os.path.abspath(f)))
214 assert not fs.isdir(f)
215 assert not fs.isdir("not-a-dir")
216
217
218 @pytest.mark.parametrize("compression_opener", [(None, open), ("gzip", gzip.open)])
219 def test_open_files_write(tmpdir, compression_opener):
220 tmpdir = str(tmpdir)
221 compression, opener = compression_opener
222 fn = str(tmpdir) + "/*.part"
223 files = open_files(fn, num=2, mode="wb", compression=compression)
224 assert len(files) == 2
225 assert {f.mode for f in files} == {"wb"}
226 for fil in files:
227 with fil as f:
228 f.write(b"000")
229 files = sorted(os.listdir(tmpdir))
230 assert files == ["0.part", "1.part"]
231
232 with opener(os.path.join(tmpdir, files[0]), "rb") as f:
233 d = f.read()
234 assert d == b"000"
235
236
237 def test_pickability_of_lazy_files(tmpdir):
238 tmpdir = str(tmpdir)
239 cloudpickle = pytest.importorskip("cloudpickle")
240
241 with filetexts(files, mode="b"):
242 myfiles = open_files("./.test.accounts.*")
243 myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles))
244
245 for f, f2 in zip(myfiles, myfiles2):
246 assert f.path == f2.path
247 assert isinstance(f.fs, type(f2.fs))
248 with f as f_open, f2 as f2_open:
249 assert f_open.read() == f2_open.read()
250
251
252 def test_abs_paths(tmpdir):
253 tmpdir = str(tmpdir)
254 here = os.getcwd()
255 os.chdir(tmpdir)
256 with open("tmp", "w") as f:
257 f.write("hi")
258 out = LocalFileSystem().glob("./*")
259 assert len(out) == 1
260 assert os.sep in out[0]
261 assert "tmp" in out[0]
262
263 # I don't know what this was testing - but should avoid local paths anyway
264 # fs = LocalFileSystem()
265 os.chdir(here)
266 # with fs.open('tmp', 'r') as f:
267 # res = f.read()
268 # assert res == 'hi'
269
270
271 @pytest.mark.parametrize("sep", ["/", "\\"])
272 @pytest.mark.parametrize("chars", ["+", "++", "(", ")", "|", "\\"])
273 def test_glob_weird_characters(tmpdir, sep, chars):
274 tmpdir = str(tmpdir)
275
276 subdir = tmpdir + sep + "test" + chars + "x"
277 os.mkdir(subdir)
278 with open(subdir + sep + "tmp", "w") as f:
279 f.write("hi")
280
281 out = LocalFileSystem().glob(subdir + sep + "*")
282 assert len(out) == 1
283 assert os.sep in out[0]
284 assert "tmp" in out[0]
285
286
287 def test_globfind_dirs(tmpdir):
288 tmpdir = str(tmpdir)
289 fs = fsspec.filesystem("file")
290 fs.mkdir(tmpdir + "/dir")
291 fs.touch(tmpdir + "/dir/afile")
292 assert [tmpdir + "/dir"] == fs.glob(tmpdir + "/*")
293 assert [tmpdir + "/dir/afile"] == fs.find(tmpdir)
294 assert [tmpdir + "/dir", tmpdir + "/dir/afile"] == fs.find(tmpdir, withdirs=True)
295
296
297 def test_get_pyarrow_filesystem():
298 pa = pytest.importorskip("pyarrow")
299
300 fs = LocalFileSystem()
301 assert isinstance(fs, pa.filesystem.FileSystem)
302 assert fs._get_pyarrow_filesystem() is fs
303
304 class UnknownFileSystem(object):
305 pass
306
307 assert not isinstance(UnknownFileSystem(), pa.filesystem.FileSystem)
308
309
310 def test_directories(tmpdir):
311 tmpdir = str(tmpdir)
312 fs = LocalFileSystem()
313 fs.mkdir(tmpdir + "/dir")
314 assert tmpdir + "/dir" in fs.ls(tmpdir)
315 assert fs.ls(tmpdir, True)[0]["type"] == "directory"
316 fs.rmdir(tmpdir + "/dir")
317 assert not fs.ls(tmpdir)
318
319
320 def test_file_ops(tmpdir):
321 tmpdir = str(tmpdir)
322 fs = LocalFileSystem()
323 with pytest.raises(FileNotFoundError):
324 fs.info(tmpdir + "/nofile")
325 fs.touch(tmpdir + "/afile")
326 i1 = fs.ukey(tmpdir + "/afile")
327
328 assert tmpdir + "/afile" in fs.ls(tmpdir)
329
330 with fs.open(tmpdir + "/afile", "wb") as f:
331 f.write(b"data")
332 i2 = fs.ukey(tmpdir + "/afile")
333 assert i1 != i2 # because file changed
334
335 fs.copy(tmpdir + "/afile", tmpdir + "/afile2")
336 assert tmpdir + "/afile2" in fs.ls(tmpdir)
337
338 fs.move(tmpdir + "/afile", tmpdir + "/afile3")
339 assert not fs.exists(tmpdir + "/afile")
340
341 fs.rm(tmpdir + "/afile3", recursive=True)
342 assert not fs.exists(tmpdir + "/afile3")
343
344 fs.rm(tmpdir, recursive=True)
345 assert not fs.exists(tmpdir)
346
347
348 def test_recursive_get_put(tmpdir):
349 tmpdir = str(tmpdir)
350 fs = LocalFileSystem()
351
352 fs.mkdir(tmpdir + "/a1/a2/a3")
353 fs.touch(tmpdir + "/a1/a2/a3/afile")
354 fs.touch(tmpdir + "/a1/afile")
355
356 fs.get("file://{0}/a1".format(tmpdir), tmpdir + "/b1", recursive=True)
357 assert fs.isfile(tmpdir + "/b1/afile")
358 assert fs.isfile(tmpdir + "/b1/a2/a3/afile")
359
360 fs.put(tmpdir + "/b1", "file://{0}/c1".format(tmpdir), recursive=True)
361 assert fs.isfile(tmpdir + "/c1/afile")
362 assert fs.isfile(tmpdir + "/c1/a2/a3/afile")
363
364
365 def test_commit_discard(tmpdir):
366 tmpdir = str(tmpdir)
367 fs = LocalFileSystem()
368 with fs.transaction:
369 with fs.open(tmpdir + "/afile", "wb") as f:
370 assert not fs.exists(tmpdir + "/afile")
371 f.write(b"data")
372 assert not fs.exists(tmpdir + "/afile")
373
374 assert fs._transaction is None
375 assert fs.cat(tmpdir + "/afile") == b"data"
376
377 try:
378 with fs.transaction:
379 with fs.open(tmpdir + "/bfile", "wb") as f:
380 f.write(b"data")
381 raise KeyboardInterrupt
382 except KeyboardInterrupt:
383 assert not fs.exists(tmpdir + "/bfile")
384
385
386 def test_make_path_posix():
387 cwd = os.getcwd()
388 assert make_path_posix("/a/posix/path") == "/a/posix/path"
389 assert make_path_posix("/posix") == "/posix"
390 assert make_path_posix("relpath", sep="/") == os.path.join(cwd, "relpath")
391 assert make_path_posix("rel/path", sep="/") == os.path.join(cwd, "rel/path")
392 assert make_path_posix("C:\\path", sep="\\") == "C:/path"
393 assert (
394 make_path_posix(
395 "\\\\windows-server\\someshare\\path\\more\\path\\dir\\foo.parquet"
396 )
397 == "//windows-server/someshare/path/more/path/dir/foo.parquet"
398 )
399 assert "/" in make_path_posix("rel\\path", sep="\\")
400
401
402 def test_links(tmpdir):
403 tmpdir = str(tmpdir)
404 fn0 = os.path.join(tmpdir, "target")
405 fn1 = os.path.join(tmpdir, "link1")
406 fn2 = os.path.join(tmpdir, "link2")
407 data = b"my target data"
408 with open(fn0, "wb") as f:
409 f.write(data)
410 os.symlink(fn0, fn1)
411 os.symlink(fn0, fn2)
412
413 fs = LocalFileSystem()
414 assert fs.info(fn0)["type"] == "file"
415 assert fs.info(fn1)["type"] == "link"
416 assert fs.info(fn2)["type"] == "link"
417
418 assert fs.info(fn0)["size"] == len(data)
419 assert fs.info(fn1)["size"] == len(data)
420 assert fs.info(fn2)["size"] == len(data)
421
422 of = fsspec.open(fn1, "rb")
423 with of as f:
424 assert f.read() == data
425
426 of = fsspec.open(fn2, "rb")
427 with of as f:
428 assert f.read() == data
0 import pytest
1 import sys
2
3
4 def test_1(m):
5 m.touch("/somefile") # NB: is found with or without initial /
6 m.touch("afiles/and/anothers")
7 assert m.find("") == ["afiles/and/anothers", "somefile"]
8 assert list(m.get_mapper("")) == ["afiles/and/anothers", "somefile"]
9
10
11 @pytest.mark.xfail(
12 sys.version_info < (3, 6),
13 reason="py35 error, see https://github.com/intake/filesystem_spec/issues/148",
14 )
15 def test_ls(m):
16 m.touch("/dir/afile")
17 m.touch("/dir/dir1/bfile")
18 m.touch("/dir/dir1/cfile")
19
20 assert m.ls("/", False) == ["/dir/"]
21 assert m.ls("/dir", False) == ["/dir/afile", "/dir/dir1/"]
22 assert m.ls("/dir", True)[0]["type"] == "file"
23 assert m.ls("/dir", True)[1]["type"] == "directory"
24
25 assert len(m.ls("/dir/dir1")) == 2
0 import pytest
1 import shlex
2 import subprocess
3 import time
4 import fsspec
5
6 pytest.importorskip("paramiko")
7
8
9 def stop_docker(name):
10 cmd = shlex.split('docker ps -a -q --filter "name=%s"' % name)
11 cid = subprocess.check_output(cmd).strip().decode()
12 if cid:
13 subprocess.call(["docker", "rm", "-f", cid])
14
15
16 @pytest.fixture(scope="module")
17 def ssh():
18 try:
19 subprocess.check_call(["docker", "run", "hello-world"])
20 except subprocess.CalledProcessError:
21 pytest.skip("docker run not available")
22 return
23
24 # requires docker
25 cmds = [
26 r"apt-get update",
27 r"apt-get install -y openssh-server",
28 r"mkdir /var/run/sshd",
29 "bash -c \"echo 'root:pass' | chpasswd\"",
30 (
31 r"sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' "
32 r"/etc/ssh/sshd_config"
33 ),
34 (
35 r"sed 's@session\s*required\s*pam_loginuid.so@session optional "
36 r"pam_loginuid.so@g' -i /etc/pam.d/sshd"
37 ),
38 r'bash -c "echo \"export VISIBLE=now\" >> /etc/profile"',
39 r"/usr/sbin/sshd",
40 ]
41 name = "fsspec_sftp"
42 stop_docker(name)
43 cmd = "docker run -d -p 9200:22 --name {} ubuntu:16.04 sleep 9000".format(name)
44 cid = subprocess.check_output(shlex.split(cmd)).strip().decode()
45 for cmd in cmds:
46 subprocess.call(["docker", "exec", cid] + shlex.split(cmd))
47 try:
48 time.sleep(1)
49 yield dict(host="localhost", port=9200, username="root", password="pass")
50 finally:
51 stop_docker(name)
52
53
54 def test_simple(ssh):
55 f = fsspec.get_filesystem_class("sftp")(**ssh)
56 f.mkdirs("/home/someuser/deeper")
57 f.touch("/home/someuser/deeper/afile")
58 assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
59 assert f.ls("/home/someuser/deeper/") == ["/home/someuser/deeper/afile"]
60 assert f.info("/home/someuser/deeper/afile")["type"] == "file"
61 assert f.info("/home/someuser/deeper/afile")["size"] == 0
62 assert f.exists("/home/someuser")
63 f.rm("/home/someuser", recursive=True)
64 assert not f.exists("/home/someuser")
65
66
67 @pytest.mark.parametrize("protocol", ["sftp", "ssh"])
68 def test_with_url(protocol, ssh):
69 fo = fsspec.open(
70 protocol + "://{username}:{password}@{host}:{port}"
71 "/home/someuserout".format(**ssh),
72 "wb",
73 )
74 with fo as f:
75 f.write(b"hello")
76 fo = fsspec.open(
77 protocol + "://{username}:{password}@{host}:{port}"
78 "/home/someuserout".format(**ssh),
79 "rb",
80 )
81 with fo as f:
82 assert f.read() == b"hello"
83
84
85 def test_transaction(ssh):
86 f = fsspec.get_filesystem_class("sftp")(**ssh)
87 f.mkdirs("/home/someuser/deeper")
88 f.start_transaction()
89 f.touch("/home/someuser/deeper/afile")
90 assert f.find("/home/someuser") == []
91 f.end_transaction()
92     assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
93
94 with f.transaction:
95 assert f._intrans
96 f.touch("/home/someuser/deeper/afile2")
97 assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"]
98 assert f.find("/home/someuser") == [
99 "/home/someuser/deeper/afile",
100 "/home/someuser/deeper/afile2",
101 ]
102
103
104 def test_makedirs_exist_ok(ssh):
105 f = fsspec.get_filesystem_class("sftp")(**ssh)
106
107 f.makedirs("/a/b/c")
108
109 with pytest.raises(FileExistsError, match="/a/b/c"):
110 f.makedirs("/a/b/c", exist_ok=False)
111
112 f.makedirs("/a/b/c", exist_ok=True)
0 import pickle
1 import pytest
2 import subprocess
3 import time
4 import fsspec
5
6 requests = pytest.importorskip("requests")
7
8 from fsspec.implementations.webhdfs import WebHDFS # noqa: E402
9
10
11 @pytest.fixture(scope="module")
12 def hdfs_cluster():
13 cmd0 = "htcluster shutdown".split()
14 try:
15 subprocess.check_output(cmd0, stderr=subprocess.STDOUT)
16 except FileNotFoundError:
17 pytest.skip("htcluster not found")
18 except subprocess.CalledProcessError as ex:
19 pytest.skip("htcluster failed: " + ex.output.decode())
20 cmd1 = "htcluster startup --image base".split()
21 subprocess.check_output(cmd1)
22 try:
23         t = 90
24         while True:
25 try:
26 requests.get("http://localhost:50070/webhdfs/v1/?op=LISTSTATUS")
27 except: # noqa: E722
28 t -= 1
29 assert t > 0, "Timeout waiting for HDFS"
30 time.sleep(1)
31 continue
32 break
33 time.sleep(7)
34 yield "localhost"
35 finally:
36 subprocess.check_output(cmd0)
37
38
39 def test_pickle(hdfs_cluster):
40 w = WebHDFS(hdfs_cluster, user="testuser")
41 w2 = pickle.loads(pickle.dumps(w))
42 assert w == w2
43
44
45 def test_simple(hdfs_cluster):
46 w = WebHDFS(hdfs_cluster, user="testuser")
47 home = w.home_directory()
48 assert home == "/user/testuser"
49 with pytest.raises(PermissionError):
50 w.mkdir("/root")
51
52
53 def test_url(hdfs_cluster):
54 url = "webhdfs://testuser@localhost:50070/user/testuser/myfile"
55 fo = fsspec.open(url, "wb", data_proxy={"worker.example.com": "localhost"})
56 with fo as f:
57 f.write(b"hello")
58 fo = fsspec.open(url, "rb", data_proxy={"worker.example.com": "localhost"})
59 with fo as f:
60 assert f.read() == b"hello"
61
62
63 def test_workflow(hdfs_cluster):
64 w = WebHDFS(
65 hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
66 )
67 fn = "/user/testuser/testrun/afile"
68 w.mkdir("/user/testuser/testrun")
69 with w.open(fn, "wb") as f:
70 f.write(b"hello")
71 assert w.exists(fn)
72 info = w.info(fn)
73 assert info["size"] == 5
74 assert w.isfile(fn)
75 assert w.cat(fn) == b"hello"
76 w.rm("/user/testuser/testrun", recursive=True)
77 assert not w.exists(fn)
78
79
80 def test_with_gzip(hdfs_cluster):
81 from gzip import GzipFile
82
83 w = WebHDFS(
84 hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
85 )
86 fn = "/user/testuser/gzfile"
87 with w.open(fn, "wb") as f:
88 gf = GzipFile(fileobj=f, mode="w")
89 gf.write(b"hello")
90 gf.close()
91 with w.open(fn, "rb") as f:
92 gf = GzipFile(fileobj=f, mode="r")
93 assert gf.read() == b"hello"
94
95
96 def test_workflow_transaction(hdfs_cluster):
97 w = WebHDFS(
98 hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
99 )
100 fn = "/user/testuser/testrun/afile"
101 w.mkdirs("/user/testuser/testrun")
102 with w.transaction:
103 with w.open(fn, "wb") as f:
104 f.write(b"hello")
105 assert not w.exists(fn)
106 assert w.exists(fn)
107 assert w.ukey(fn)
108 files = w.ls("/user/testuser/testrun", True)
109 summ = w.content_summary("/user/testuser/testrun")
110 assert summ["length"] == files[0]["size"]
111 assert summ["fileCount"] == 1
112
113 w.rm("/user/testuser/testrun", recursive=True)
114 assert not w.exists(fn)
0 import zipfile
1 from contextlib import contextmanager
2 import os
3 import pickle
4 import pytest
5 import sys
6 import tempfile
7 import fsspec
8
9
10 @contextmanager
11 def tempzip(data={}):
12 f = tempfile.mkstemp(suffix="zip")[1]
13 with zipfile.ZipFile(f, mode="w") as z:
14 for k, v in data.items():
15 z.writestr(k, v)
16 try:
17 yield f
18 finally:
19 try:
20 os.remove(f)
21 except (IOError, OSError):
22 pass
23
24
25 data = {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"}
26
27
28 def test_empty():
29 with tempzip() as z:
30 fs = fsspec.get_filesystem_class("zip")(fo=z)
31 assert fs.find("") == []
32
33
34 @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip-info odd on py35")
35 def test_mapping():
36 with tempzip(data) as z:
37 fs = fsspec.get_filesystem_class("zip")(fo=z)
38 m = fs.get_mapper("")
39 assert list(m) == ["a", "b", "deeply/nested/path"]
40 assert m["b"] == data["b"]
41
42
43 @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip not supported on py35")
44 def test_pickle():
45 with tempzip(data) as z:
46 fs = fsspec.get_filesystem_class("zip")(fo=z)
47 fs2 = pickle.loads(pickle.dumps(fs))
48 assert fs2.cat("b") == b"hello"
0 # https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
1
2 import requests
3 from urllib.parse import quote
4 import uuid
5 from ..spec import AbstractFileSystem, AbstractBufferedFile
6 from ..utils import infer_storage_options
7 import logging
8
9 logger = logging.getLogger("webhdfs")
10
11
12 class WebHDFS(AbstractFileSystem):
13 """
14 Interface to HDFS over HTTP
15
16 Three auth mechanisms are supported:
17
18 insecure: no auth is done, and the user is assumed to be whoever they
19 say they are (parameter `user`), or a predefined value such as
20 "dr.who" if not given
21 spnego: when kerberos authentication is enabled, auth is negotiated by
22 requests_kerberos https://github.com/requests/requests-kerberos .
23 This establishes a session based on existing kinit login and/or
24         specified principal/password; parameters are passed with ``kerb_kwargs``
25 token: uses an existing Hadoop delegation token from another secured
26 service. Indeed, this client can also generate such tokens when
27 not insecure. Note that tokens expire, but can be renewed (by a
28 previously specified user) and may allow for proxying.
29
30 """
31
32 tempdir = "/tmp"
33 protocol = "webhdfs", "webHDFS"
34
35 def __init__(
36 self,
37 host,
38 port=50070,
39 kerberos=False,
40 token=None,
41 user=None,
42 proxy_to=None,
43 kerb_kwargs=None,
44 data_proxy=None,
45 **kwargs
46 ):
47 """
48 Parameters
49 ----------
50 host: str
51 Name-node address
52 port: int
53 Port for webHDFS
54 kerberos: bool
55 Whether to authenticate with kerberos for this connection
56 token: str or None
57 If given, use this token on every call to authenticate. A user
58             and user-proxy may be encoded in the token and should not also be
59 given
60 user: str or None
61 If given, assert the user name to connect with
62 proxy_to: str or None
63 If given, the user has the authority to proxy, and this value is
64             the user in whose name actions are taken
65 kerb_kwargs: dict
66 Any extra arguments for HTTPKerberosAuth, see
67 https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py
68 data_proxy: dict, callable or None
69 If given, map data-node addresses. This can be necessary if the
70 HDFS cluster is behind a proxy, running on Docker or otherwise has
71 a mismatch between the host-names given by the name-node and the
72 address by which to refer to them from the client. If a dict,
73 maps host names `host->data_proxy[host]`; if a callable, full
74 URLs are passed, and function must conform to
75 `url->data_proxy(url)`.
76 kwargs
77 """
78 if self._cached:
79 return
80 super().__init__(**kwargs)
81 self.url = "http://{host}:{port}/webhdfs/v1".format(host=host, port=port)
82 self.kerb = kerberos
83 self.kerb_kwargs = kerb_kwargs or {}
84 self.pars = {}
85 self.proxy = data_proxy or {}
86 if token is not None:
87 if user is not None or proxy_to is not None:
88 raise ValueError(
89 "If passing a delegation token, must not set "
90 "user or proxy_to, as these are encoded in the"
91 " token"
92 )
93 self.pars["delegation"] = token
94 if user is not None:
95 self.pars["user.name"] = user
96 if proxy_to is not None:
97 self.pars["doas"] = proxy_to
98 if kerberos and user is not None:
99 raise ValueError(
100 "If using Kerberos auth, do not specify the "
101 "user, this is handled by kinit."
102 )
103 self._connect()
104
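    # Illustrative sketch (not part of the original source): how the three auth
    # modes described in the class docstring might be selected at construction
    # time. The host name below is a placeholder.
    #
    #     fs = WebHDFS("namenode.example.com", user="alice")          # insecure
    #     fs = WebHDFS("namenode.example.com", kerberos=True)         # spnego
    #     fs = WebHDFS("namenode.example.com", token="<delegation>")  # token
    #
    # Equivalently via the registry: fsspec.filesystem("webhdfs", host=..., user=...)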
105 def _connect(self):
106 self.session = requests.Session()
107 if self.kerb:
108 from requests_kerberos import HTTPKerberosAuth
109
110 self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
111
112 def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
113 url = self.url + quote(path or "")
114 args = kwargs.copy()
115 args.update(self.pars)
116 args["op"] = op.upper()
117 logger.debug(url, method, args)
118 out = self.session.request(
119 method=method.upper(),
120 url=url,
121 params=args,
122 data=data,
123 allow_redirects=redirect,
124 )
125 if out.status_code == 404:
126 raise FileNotFoundError(path)
127 if out.status_code == 403:
128 raise PermissionError(path or "")
129 if out.status_code == 401:
130 raise PermissionError # not specific to path
131 out.raise_for_status()
132 return out
133
134 def _open(
135 self,
136 path,
137 mode="rb",
138 block_size=None,
139 autocommit=True,
140 replication=None,
141 permissions=None,
142 **kwargs
143 ):
144 """
145
146 Parameters
147 ----------
148 path: str
149 File location
150 mode: str
151 'rb', 'wb', etc.
152 block_size: int
153 Client buffer size for read-ahead or write buffer
154 autocommit: bool
155 If False, writes to temporary file that only gets put in final
156 location upon commit
157 replication: int
158 Number of copies of file on the cluster, write mode only
159 permissions: str or int
160 posix permissions, write mode only
161 kwargs
162
163 Returns
164 -------
165 WebHDFile instance
166 """
167 block_size = block_size or self.blocksize
168 return WebHDFile(
169 self,
170 path,
171 mode=mode,
172 block_size=block_size,
173 tempdir=self.tempdir,
174 autocommit=autocommit,
175 replication=replication,
176 permissions=permissions,
177 )
178
179 @staticmethod
180 def _process_info(info):
181 info["type"] = info["type"].lower()
182 info["size"] = info["length"]
183 return info
184
185 @classmethod
186 def _strip_protocol(cls, path):
187 return infer_storage_options(path)["path"]
188
189 @staticmethod
190 def _get_kwargs_from_urls(urlpath):
191 out = infer_storage_options(urlpath)
192 out.pop("path", None)
193 out.pop("protocol", None)
194 if "username" in out:
195 out["user"] = out.pop("username")
196 return out
197
198 def info(self, path):
199 out = self._call("GETFILESTATUS", path=path)
200 info = out.json()["FileStatus"]
201 info["name"] = path
202 return self._process_info(info)
203
204 def ls(self, path, detail=False):
205 out = self._call("LISTSTATUS", path=path)
206 infos = out.json()["FileStatuses"]["FileStatus"]
207 for info in infos:
208 self._process_info(info)
209 info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
210 if detail:
211 return sorted(infos, key=lambda i: i["name"])
212 else:
213 return sorted(info["name"] for info in infos)
214
215 def content_summary(self, path):
216 """Total numbers of files, directories and bytes under path"""
217 out = self._call("GETCONTENTSUMMARY", path=path)
218 return out.json()["ContentSummary"]
219
220 def ukey(self, path):
221 """Checksum info of file, giving method and result"""
222 out = self._call("GETFILECHECKSUM", path=path, redirect=False)
223 location = self._apply_proxy(out.headers["Location"])
224 out2 = self.session.get(location)
225 out2.raise_for_status()
226 return out2.json()["FileChecksum"]
227
228 def home_directory(self):
229 """Get user's home directory"""
230 out = self._call("GETHOMEDIRECTORY")
231 return out.json()["Path"]
232
233 def get_delegation_token(self, renewer=None):
234         """Retrieve token which can give the same authority to other users
235
236 Parameters
237 ----------
238 renewer: str or None
239 User who may use this token; if None, will be current user
240 """
241 if renewer:
242 out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
243 else:
244 out = self._call("GETDELEGATIONTOKEN")
245 t = out.json()["Token"]
246 if t is None:
247 raise ValueError("No token available for this user/security context")
248 return t["urlString"]
249
250 def renew_delegation_token(self, token):
251 """Make token live longer. Returns new expiry time"""
252 out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
253 return out.json()["long"]
254
255 def cancel_delegation_token(self, token):
256 """Stop the token from being useful"""
257 self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
258
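    # Illustrative sketch (not from the original source): the delegation-token
    # life-cycle exposed by the three methods above.
    #
    #     tok = fs.get_delegation_token()          # issue for the current user
    #     expiry = fs.renew_delegation_token(tok)  # extend its lifetime
    #     fs.cancel_delegation_token(tok)          # stop it being useful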
259 def chmod(self, path, mod):
260 """Set the permission at path
261
262 Parameters
263 ----------
264 path: str
265 location to set (file or directory)
266 mod: str or int
267             posix representation of permission, given as oct string, e.g., '777'
268 or 0o777
269 """
270 self._call("SETPERMISSION", method="put", path=path, permission=mod)
271
272 def chown(self, path, owner=None, group=None):
273 """Change owning user and/or group"""
274 kwargs = {}
275 if owner is not None:
276 kwargs["owner"] = owner
277 if group is not None:
278 kwargs["group"] = group
279 self._call("SETOWNER", method="put", path=path, **kwargs)
280
281 def set_replication(self, path, replication):
282 """
283 Set file replication factor
284
285 Parameters
286 ----------
287 path: str
288 File location (not for directories)
289 replication: int
290 Number of copies of file on the cluster. Should be smaller than
291 number of data nodes; normally 3 on most systems.
292 """
293 self._call("SETREPLICATION", path=path, method="put", replication=replication)
294
295 def mkdir(self, path, **kwargs):
296 self._call("MKDIRS", method="put", path=path)
297
298 def makedirs(self, path, exist_ok=False):
299 if exist_ok is False and self.exists(path):
300 raise FileExistsError(path)
301 self.mkdir(path)
302
303 def mv(self, path1, path2, **kwargs):
304 self._call("RENAME", method="put", path=path1, destination=path2)
305
306 def rm(self, path, recursive=False, **kwargs):
307 self._call(
308 "DELETE",
309 method="delete",
310 path=path,
311 recursive="true" if recursive else "false",
312 )
313
314 def _apply_proxy(self, location):
315 if self.proxy and callable(self.proxy):
316 location = self.proxy(location)
317 elif self.proxy:
318 # as a dict
319 for k, v in self.proxy.items():
320 location = location.replace(k, v, 1)
321 return location
322
323
324 class WebHDFile(AbstractBufferedFile):
325 """A file living in HDFS over webHDFS"""
326
327 def __init__(self, fs, path, **kwargs):
328 super().__init__(fs, path, **kwargs)
329 kwargs = kwargs.copy()
330 if kwargs.get("permissions", None) is None:
331 kwargs.pop("permissions", None)
332 if kwargs.get("replication", None) is None:
333 kwargs.pop("replication", None)
334 self.permissions = kwargs.pop("permissions", 511)
335 tempdir = kwargs.pop("tempdir")
336 if kwargs.pop("autocommit", False) is False:
337 self.target = self.path
338 self.path = "/".join([tempdir, str(uuid.uuid4())])
339
340 def _upload_chunk(self, final=False):
341 """ Write one part of a multi-block file upload
342
343 Parameters
344 ==========
345         ----------
346 This is the last block, so should complete file, if
347 self.autocommit is True.
348 """
349 out = self.fs.session.post(self.location, data=self.buffer.getvalue())
350 out.raise_for_status()
351 return True
352
353 def _initiate_upload(self):
354 """ Create remote file/upload """
355 if "a" in self.mode:
356 op, method = "APPEND", "POST"
357 else:
358 op, method = "CREATE", "PUT"
359 if self.fs.exists(self.path):
360 # no "truncate" or "create empty"
361 self.fs.rm(self.path)
362 out = self.fs._call(op, method, self.path, redirect=False, **self.kwargs)
363 location = self.fs._apply_proxy(out.headers["Location"])
364 if "w" in self.mode:
365 # create empty file to append to
366 out2 = self.fs.session.put(location)
367 out2.raise_for_status()
368 self.location = location.replace("CREATE", "APPEND")
369
370 def _fetch_range(self, start, end):
371 out = self.fs._call(
372 "OPEN", path=self.path, offset=start, length=end - start, redirect=False
373 )
374 out.raise_for_status()
375 location = out.headers["Location"]
376 out2 = self.fs.session.get(self.fs._apply_proxy(location))
377 return out2.content
378
379 def commit(self):
380 self.fs.mv(self.path, self.target)
381
382 def discard(self):
383 self.fs.rm(self.path)
0 from __future__ import print_function, division, absolute_import
1
2 import zipfile
3 from fsspec import AbstractFileSystem, open_files
4 from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE
5
6
7 class ZipFileSystem(AbstractFileSystem):
8 """Read contents of ZIP archive as a file-system
9
10 Keeps file object open while instance lives.
11
12 This class is pickleable, but not necessarily thread-safe
13 """
14
15 root_marker = ""
16
17 def __init__(self, fo="", mode="r", **storage_options):
18 """
19 Parameters
20 ----------
21 fo: str or file-like
22 Contains ZIP, and must exist. If a str, will fetch file using
23 `open_files()`, which must return one file exactly.
24 mode: str
25 Currently, only 'r' accepted
26 storage_options: key-value
27 May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
28 other parameters for requests
29 """
30 if self._cached:
31 return
32 AbstractFileSystem.__init__(self)
33 if mode != "r":
34 raise ValueError("Only read from zip files accepted")
35 self.in_fo = fo
36 if isinstance(fo, str):
37 files = open_files(fo)
38 if len(files) != 1:
39 raise ValueError(
40                     'Path "{}" did not resolve to exactly '
41 'one file: "{}"'.format(fo, files)
42 )
43 fo = files[0]
44 self.fo = fo.__enter__() # the whole instance is a context
45 self.zip = zipfile.ZipFile(self.fo)
46 self.block_size = storage_options.get("block_size", DEFAULT_BLOCK_SIZE)
47 self.dir_cache = None
48
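    # Illustrative sketch (assumed usage, not from the original source):
    # reading a member of a local archive; "archive.zip" and the member name
    # are placeholders.
    #
    #     fs = ZipFileSystem("archive.zip")
    #     fs.ls("")                                # entries at the archive root
    #     with fs.open("inner/file.txt") as f:     # hypothetical member
    #         data = f.read()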
49 @classmethod
50 def _strip_protocol(cls, path):
51 # zip file paths are always relative to the archive root
52 return super()._strip_protocol(path).lstrip("/")
53
54 def _get_dirs(self):
55 if self.dir_cache is None:
56 files = self.zip.infolist()
57 self.dir_cache = {}
58 for z in files:
59 f = {s: getattr(z, s) for s in zipfile.ZipInfo.__slots__}
60 f.update(
61 {
62 "name": z.filename,
63 "size": z.file_size,
64 "type": ("directory" if z.is_dir() else "file"),
65 }
66 )
67 self.dir_cache[f["name"]] = f
68
69 def ls(self, path, detail=False):
70 self._get_dirs()
71 paths = {}
72 for p, f in self.dir_cache.items():
73 p = p.rstrip("/")
74 if "/" in p:
75 root = p.rsplit("/", 1)[0]
76 else:
77 root = ""
78 if root == path.rstrip("/"):
79 paths[p] = f
80 elif path and all(
81 (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/"))
82 ):
83 # implicit directory
84 ppath = "/".join(p.split("/")[: len(path.split("/")) + 1])
85 if ppath not in paths:
86 out = {"name": ppath + "/", "size": 0, "type": "directory"}
87 paths[ppath] = out
88
89 elif all(
90 (a == b)
91 for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
92 ):
93 # root directory entry
94 ppath = p.rstrip("/").split("/", 1)[0]
95 if ppath not in paths:
96 out = {"name": ppath + "/", "size": 0, "type": "directory"}
97 paths[ppath] = out
98 out = list(paths.values())
99 if detail:
100 return out
101 else:
102 return list(sorted(f["name"] for f in out))
103
104 def cat(self, path):
105 return self.zip.read(path)
106
107 def _open(self, path, mode="rb", **kwargs):
108 path = self._strip_protocol(path)
109 if mode != "rb":
110 raise NotImplementedError
111 info = self.info(path)
112 out = self.zip.open(path, "r")
113 out.size = info["size"]
114 out.name = info["name"]
115 return out
116
117 def ukey(self, path):
118 return tokenize(path, self.in_fo, self.protocol)
0 from collections.abc import MutableMapping
1 from .registry import get_filesystem_class
2 from .core import split_protocol
3
4
5 class FSMap(MutableMapping):
6 """Wrap a FileSystem instance as a mutable wrapping.
7
8 The keys of the mapping become files under the given root, and the
9 values (which must be bytes) the contents of those files.
10
11 Parameters
12 ----------
13 root: string
14 prefix for all the files
15 fs: FileSystem instance
16     check: bool (=False)
17 performs a touch at the location, to check for write access.
18
19 Examples
20 --------
21 >>> fs = FileSystem(**parameters) # doctest: +SKIP
22 >>> d = FSMap('my-data/path/', fs) # doctest: +SKIP
23 or, more likely
24 >>> d = fs.get_mapper('my-data/path/')
25
26 >>> d['loc1'] = b'Hello World' # doctest: +SKIP
27 >>> list(d.keys()) # doctest: +SKIP
28 ['loc1']
29 >>> d['loc1'] # doctest: +SKIP
30 b'Hello World'
31 """
32
33 def __init__(self, root, fs, check=False, create=False):
34 self.fs = fs
35 self.root = fs._strip_protocol(root).rstrip(
36 "/"
37 ) # we join on '/' in _key_to_str
38 if create:
39 if not self.fs.exists(root):
40 self.fs.mkdir(root)
41 if check:
42 if not self.fs.exists(root):
43 raise ValueError(
44                         "Path %s does not exist. Create"
45 " with the ``create=True`` keyword" % root
46 )
47 self.fs.touch(root + "/a")
48 self.fs.rm(root + "/a")
49
50 def clear(self):
51 """Remove all keys below root - empties out mapping
52 """
53 try:
54 self.fs.rm(self.root, True)
55 self.fs.mkdir(self.root)
56 except: # noqa: E722
57 pass
58
59 def _key_to_str(self, key):
60 """Generate full path for the key"""
61 if isinstance(key, (tuple, list)):
62 key = str(tuple(key))
63 else:
64 key = str(key)
65 return "/".join([self.root, key]) if self.root else key
66
67 def _str_to_key(self, s):
68         """Strip path prefix to leave key name"""
69 return s[len(self.root) :].lstrip("/")
70
71 def __getitem__(self, key, default=None):
72 """Retrieve data"""
73 key = self._key_to_str(key)
74 try:
75 result = self.fs.cat(key)
76 except: # noqa: E722
77 if default is not None:
78 return default
79 raise KeyError(key)
80 return result
81
82 def pop(self, key, default=None):
83 result = self.__getitem__(key, default)
84 try:
85 del self[key]
86 except KeyError:
87 pass
88 return result
89
90 def __setitem__(self, key, value):
91 """Store value in key"""
92 key = self._key_to_str(key)
93 self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
94 with self.fs.open(key, "wb") as f:
95 f.write(value)
96
97 def __iter__(self):
98 return (self._str_to_key(x) for x in self.fs.find(self.root))
99
100 def __len__(self):
101 return len(self.fs.find(self.root))
102
103 def __delitem__(self, key):
104 """Remove key"""
105 try:
106 self.fs.rm(self._key_to_str(key))
107 except: # noqa: E722
108 raise KeyError
109
110 def __contains__(self, key):
111 """Does key exist in mapping?"""
112 return self.fs.exists(self._key_to_str(key))
113
114 def __getstate__(self):
115 """Mapping should be pickleable"""
116 # TODO: replace with reduce to reinstantiate?
117 return self.fs, self.root
118
119 def __setstate__(self, state):
120 fs, root = state
121 self.fs = fs
122 self.root = root
123
124
125 def get_mapper(url, check=False, create=False, **kwargs):
126 """Create key-value interface for given URL and options
127
128 The URL will be of the form "protocol://location" and point to the root
129 of the mapper required. All keys will be file-names below this location,
130 and their values the contents of each key.
131
132 Parameters
133 ----------
134 url: str
135 Root URL of mapping
136 check: bool
137 Whether to attempt to read from the location before instantiation, to
138 check that the mapping does exist
139 create: bool
140 Whether to make the directory corresponding to the root before
141 instantiating
142
143 Returns
144 -------
145 ``FSMap`` instance, the dict-like key-value store.
146 """
147 protocol, path = split_protocol(url)
148 cls = get_filesystem_class(protocol)
149 fs = cls(**kwargs)
150 # Removing protocol here - could defer to each open() on the backend
151 return FSMap(url, fs, check, create)
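
# Illustrative sketch (assumed usage, not from the original source): the memory
# filesystem is used here only because it needs no external service.
#
#     m = get_mapper("memory://mapped")
#     m["key"] = b"value"          # writes the file <root>/key
#     list(m)                      # -> ["key"]
#     m["key"]                     # -> b"value"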
0 import importlib
1 from distutils.version import LooseVersion
2
3 __all__ = ["registry", "get_filesystem_class", "default"]
4
5 # mapping protocol: implementation class object
6 registry = {}
7 default = "file"
8
9 # protocols mapped to the class which implements them. This dict can
10 # be dynamically updated.
11 known_implementations = {
12 "file": {"class": "fsspec.implementations.local.LocalFileSystem"},
13 "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
14 "http": {
15 "class": "fsspec.implementations.http.HTTPFileSystem",
16 "err": 'HTTPFileSystem requires "requests" to be installed',
17 },
18 "https": {
19 "class": "fsspec.implementations.http.HTTPFileSystem",
20 "err": 'HTTPFileSystem requires "requests" to be installed',
21 },
22 "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
23 "gcs": {
24 "class": "gcsfs.GCSFileSystem",
25 "err": "Please install gcsfs to access Google Storage",
26 },
27 "gs": {
28 "class": "gcsfs.GCSFileSystem",
29 "err": "Please install gcsfs to access Google Storage",
30 },
31 "sftp": {
32 "class": "fsspec.implementations.sftp.SFTPFileSystem",
33 "err": 'SFTPFileSystem requires "paramiko" to be installed',
34 },
35 "ssh": {
36 "class": "fsspec.implementations.sftp.SFTPFileSystem",
37 "err": 'SFTPFileSystem requires "paramiko" to be installed',
38 },
39 "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
40 "hdfs": {
41 "class": "fsspec.implementations.hdfs.PyArrowHDFS",
42 "err": "pyarrow and local java libraries required for HDFS",
43 },
44 "webhdfs": {
45 "class": "fsspec.implementations.webhdfs.WebHDFS",
46 "err": 'webHDFS access requires "requests" to be installed',
47 },
48 "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
49 "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
50 "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
51 "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
52 "dask": {
53 "class": "fsspec.implementations.dask.DaskWorkerFileSystem",
54 "err": "Install dask distributed to access worker file system",
55 },
56 }
57
58 minversions = {"s3fs": LooseVersion("0.3.0"), "gcsfs": LooseVersion("0.3.0")}
59
60
61 def get_filesystem_class(protocol):
62 """Fetch named protocol implementation from the registry
63
64 The dict ``known_implementations`` maps protocol names to the locations
65 of classes implementing the corresponding file-system. When used for the
66 first time, appropriate imports will happen and the class will be placed in
67 the registry. All subsequent calls will fetch directly from the registry.
68
69 Some protocol implementations require additional dependencies, and so the
70 import may fail. In this case, the string in the "err" field of the
71 ``known_implementations`` will be given as the error message.
72 """
73 if protocol is None:
74 protocol = default
75
76 if protocol not in registry:
77 if protocol not in known_implementations:
78 raise ValueError("Protocol not known: %s" % protocol)
79 bit = known_implementations[protocol]
80 mod, name = bit["class"].rsplit(".", 1)
81 minversion = minversions.get(mod, None)
82 err = None
83 try:
84 mod = importlib.import_module(mod)
85 except ImportError:
86 err = ImportError(bit["err"])
87
88 except Exception as e:
89 err = e
90 if err is not None:
91 raise RuntimeError(str(err))
92
93 if minversion:
94 version = getattr(mod, "__version__", None)
95 if version and LooseVersion(version) < minversion:
96 raise RuntimeError(
97 "'{}={}' is installed, but version '{}' or "
98 "higher is required".format(mod.__name__, version, minversion)
99 )
100 registry[protocol] = getattr(mod, name)
101 cls = registry[protocol]
102 if getattr(cls, "protocol", None) in ("abstract", None):
103 cls.protocol = protocol
104
105 return cls
106
107
108 def filesystem(protocol, **storage_options):
109 """Instantiate filesystems for given protocol and arguments
110
111 ``storage_options`` are specific to the protocol being chosen, and are
112 passed directly to the class.
113 """
114 cls = get_filesystem_class(protocol)
115 return cls(**storage_options)
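
# Illustrative sketch (assumed usage, not from the original source): protocols
# resolve lazily through ``known_implementations``, and the dict can be extended
# at runtime before first use.
#
#     cls = get_filesystem_class("memory")      # imports and caches the class
#     fs = filesystem("memory")                 # equivalent to cls()
#     known_implementations["myproto"] = {
#         "class": "mypackage.MyFileSystem",    # hypothetical dotted path
#         "err": "install mypackage to use myproto",
#     }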
0 import warnings
1 from hashlib import md5
2 import io
3 import os
4 import logging
5
6 from .transaction import Transaction
7 from .utils import read_block, tokenize, stringify_path
8
9 logger = logging.getLogger("fsspec")
10
11
12 def make_instance(cls, args, kwargs):
13 return cls(*args, **kwargs)
14
15
16 class _Cached(type):
17 """
18 Metaclass for caching file system instances.
19
20 Notes
21 -----
22 Instances are cached according to
23
24 * The values of the class attributes listed in `_extra_tokenize_attributes`
25 * The arguments passed to ``__init__``.
26
27 This creates an additional reference to the filesystem, which prevents the
28 filesystem from being garbage collected when all *user* references go away.
29 A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
30 be made for a filesystem instance to be garbage collected.
31 """
32
33 cachable = True
34 _extra_tokenize_attributes = ()
35
36 def __init__(self, *args, **kwargs):
37 super().__init__(*args, **kwargs)
38 # Note: we intentionally create a reference here, to avoid garbage
39 # collecting instances when all other references are gone. To really
40 # delete a FileSystem, the cache must be cleared.
41 self._cache = {}
42
43 def __call__(self, *args, **kwargs):
44 cls = type(self)
45 extra_tokens = tuple(
46 getattr(self, attr, None) for attr in self._extra_tokenize_attributes
47 )
48 token = tokenize(cls, *args, *extra_tokens, **kwargs)
49 if self.cachable and token in self._cache:
50 return self._cache[token]
51 else:
52 obj = super().__call__(*args, **kwargs)
53 # Setting _fs_token here causes some static linters to complain.
54 obj._fs_token_ = token
55 self.storage_args = args
56 self.storage_options = kwargs
57
58 if self.cachable:
59 self._cache[token] = obj
60 return obj
61
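# Illustrative sketch (not from the original source): because of the _Cached
# metaclass, constructing a cachable filesystem twice with identical arguments
# returns the same object.
#
#     import fsspec
#     a = fsspec.filesystem("memory")
#     b = fsspec.filesystem("memory")
#     assert a is b        # same token -> same cached instance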
62
63 try: # optionally derive from pyarrow's FileSystem, if available
64 import pyarrow as pa
65
66 up = pa.filesystem.DaskFileSystem
67 except ImportError:
68 up = object
69
70
71 class AbstractFileSystem(up, metaclass=_Cached):
72 """
73 An abstract super-class for pythonic file-systems
74
75 Implementations are expected to be compatible with or, better, subclass
76 from here.
77 """
78
79 cachable = True # this class can be cached, instances reused
80 _cached = False
81 blocksize = 2 ** 22
82 sep = "/"
83 protocol = "abstract"
84 root_marker = "" # For some FSs, may require leading '/' or other character
85
86 #: Extra *class attributes* that should be considered when hashing.
87 _extra_tokenize_attributes = ()
88
89 def __init__(self, *args, **storage_options):
90 """Create and configure file-system instance
91
92 Instances may be cachable, so if similar enough arguments are seen
93 a new instance is not required. The token attribute exists to allow
94 implementations to cache instances if they wish.
95
96 A reasonable default should be provided if there are no arguments.
97
98 Subclasses should call this method.
99
100 Magic kwargs that affect functionality here:
101 add_docs: if True, will append docstrings from this spec to the
102 specific implementation
103 """
104 if self._cached:
105 # reusing instance, don't change
106 return
107 self._cached = True
108 self._intrans = False
109 self._transaction = None
110 self.dircache = {}
111
112 if storage_options.pop("add_docs", None):
113 warnings.warn("add_docs is no longer supported.", FutureWarning)
114
115 if storage_options.pop("add_aliases", None):
116 warnings.warn("add_aliases has been removed.", FutureWarning)
117 # This is set in _Cached
118 self._fs_token_ = None
119
120 @property
121 def _fs_token(self):
122 return self._fs_token_
123
124 def __dask_tokenize__(self):
125 return self._fs_token
126
127 def __hash__(self):
128 return int(self._fs_token, 16)
129
130 def __eq__(self, other):
131 return isinstance(other, type(self)) and self._fs_token == other._fs_token
132
133 @classmethod
134 def _strip_protocol(cls, path):
135 """ Turn path from fully-qualified to file-system-specific
136
137 May require FS-specific handling, e.g., for relative paths or links.
138 """
139 path = stringify_path(path)
140 protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
141 for protocol in protos:
142 path = path.rstrip("/")
143 if path.startswith(protocol + "://"):
144 path = path[len(protocol) + 3 :]
145 elif path.startswith(protocol + ":"):
146 path = path[len(protocol) + 1 :]
147 # use of root_marker to make minimum required path, e.g., "/"
148 return path or cls.root_marker
149
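    # Illustrative sketch (not from the original source): for a subclass with
    # protocol "myproto" and the default empty root_marker,
    # _strip_protocol("myproto://bucket/key/") -> "bucket/key" and
    # _strip_protocol("myproto://") -> "" (the root marker).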
150 @staticmethod
151 def _get_kwargs_from_urls(paths):
152 """If kwargs can be encoded in the paths, extract them here
153
154 This should happen before instantiation of the class; incoming paths
155 then should be amended to strip the options in methods.
156
157 Examples may look like an sftp path "sftp://user@host:/my/path", where
158 the user and host should become kwargs and later get stripped.
159 """
160 # by default, nothing happens
161 return {}
162
163 @classmethod
164 def current(cls):
165 """ Return the most recently created FileSystem
166
167 If no instance has been created, then create one with defaults
168 """
169 if not len(cls._cache):
170 return cls()
171 else:
172 return list(cls._cache.values())[-1]
173
174 @property
175 def transaction(self):
176 """A context within which files are committed together upon exit
177
178 Requires the file class to implement `.commit()` and `.discard()`
179 for the normal and exception cases.
180 """
181 if self._transaction is None:
182 self._transaction = Transaction(self)
183 return self._transaction
184
185 def start_transaction(self):
186 """Begin write transaction for deferring files, non-context version"""
187 self._intrans = True
188 self._transaction = Transaction(self)
189 return self.transaction
190
191 def end_transaction(self):
192 """Finish write transaction, non-context version"""
193 self.transaction.complete()
194 self._transaction = None
195
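    # Illustrative sketch (not from the original source): files written inside
    # the context manager are only committed when the block exits cleanly.
    #
    #     with fs.transaction:
    #         with fs.open("/tmp/afile", "wb") as f:   # placeholder path
    #             f.write(b"data")
    #     # the file only exists once the transaction completes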
196 def invalidate_cache(self, path=None):
197 """
198 Discard any cached directory information
199
200 Parameters
201 ----------
202 path: string or None
203 If None, clear all listings cached else listings at or under given
204 path.
205 """
206 pass # not necessary to implement, may have no cache
207
208 def mkdir(self, path, create_parents=True, **kwargs):
209 """
210 Create directory entry at path
211
212         For systems that don't have true directories, may create one for
213 this instance only and not touch the real filesystem
214
215 Parameters
216 ----------
217 path: str
218 location
219 create_parents: bool
220 if True, this is equivalent to ``makedirs``
221 kwargs:
222 may be permissions, etc.
223 """
224 pass # not necessary to implement, may not have directories
225
226 def makedirs(self, path, exist_ok=False):
227 """Recursively make directories
228
229 Creates directory at path and any intervening required directories.
230 Raises exception if, for instance, the path already exists but is a
231 file.
232
233 Parameters
234 ----------
235 path: str
236 leaf directory name
237 exist_ok: bool (False)
238             If False, will error if the target already exists
239 """
240 pass # not necessary to implement, may not have directories
241
242 def rmdir(self, path):
243 """Remove a directory, if empty"""
244 pass # not necessary to implement, may not have directories
245
246 def ls(self, path, detail=True, **kwargs):
247 """List objects at path.
248
249 This should include subdirectories and files at that location. The
250 difference between a file and a directory must be clear when details
251 are requested.
252
253 The specific keys, or perhaps a FileInfo class, or similar, is TBD,
254 but must be consistent across implementations.
255 Must include:
256 - full path to the entry (without protocol)
257 - size of the entry, in bytes. If the value cannot be determined, will
258 be ``None``.
259 - type of entry, "file", "directory" or other
260
261 Additional information
262         may be present, appropriate to the file-system, e.g., generation,
263 checksum, etc.
264
265 May use refresh=True|False to allow use of self._ls_from_cache to
266 check for a saved listing and avoid calling the backend. This would be
267 common where listing may be expensive.
268
269 Parameters
270 ----------
271 path: str
272 detail: bool
273 if True, gives a list of dictionaries, where each is the same as
274 the result of ``info(path)``. If False, gives a list of paths
275 (str).
276 kwargs: may have additional backend-specific options, such as version
277 information
278
279 Returns
280 -------
281 List of strings if detail is False, or list of directory information
282 dicts if detail is True.
283 """
284 raise NotImplementedError
285
286 def _ls_from_cache(self, path):
287 """Check cache for listing
288
289         Returns listing, if found (may be empty list for a directory that exists
290 but contains nothing), None if not in cache.
291 """
292 parent = self._parent(path)
293 if path in self.dircache:
294 return self.dircache[path]
295 elif parent in self.dircache:
296 files = [f for f in self.dircache[parent] if f["name"] == path]
297 if len(files) == 0:
298 # parent dir was listed but did not contain this file
299 raise FileNotFoundError(path)
300 return files
301
302 def walk(self, path, maxdepth=None, **kwargs):
303         """ Return all files below path
304
305 List all files, recursing into subdirectories; output is iterator-style,
306 like ``os.walk()``. For a simple list of files, ``find()`` is available.
307
308 Note that the "files" outputted will include anything that is not
309 a directory, such as links.
310
311 Parameters
312 ----------
313 path: str
314 Root to recurse into
315 maxdepth: int
316 Maximum recursion depth. None means limitless, but not recommended
317 on link-based file-systems.
318 kwargs: passed to ``ls``
319 """
320 path = self._strip_protocol(path)
321 full_dirs = []
322 dirs = []
323 files = []
324
325 try:
326 listing = self.ls(path, detail=True, **kwargs)
327 except (FileNotFoundError, IOError):
328 return [], [], []
329
330 for info in listing:
331 # each info name must be at least [path]/part , but here
332 # we check also for names like [path]/part/
333 name = info["name"].rstrip("/")
334 if info["type"] == "directory" and name != path:
335 # do not include "self" path
336 full_dirs.append(name)
337 dirs.append(name.rsplit("/", 1)[-1])
338 elif name == path:
339                 # file-like with same name as given path
340 files.append("")
341 else:
342 files.append(name.rsplit("/", 1)[-1])
343 yield path, dirs, files
344
345 for d in full_dirs:
346 if maxdepth is None or maxdepth > 1:
347 for res in self.walk(
348 d,
349 maxdepth=(maxdepth - 1) if maxdepth is not None else None,
350 **kwargs
351 ):
352 yield res
353
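    # Illustrative sketch (not from the original source): walk() iterates like
    # os.walk(), yielding (current_path, dir_names, file_names) tuples.
    #
    #     for root, dirs, files in fs.walk("/data", maxdepth=2):   # placeholder path
    #         print(root, dirs, files)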
354 def find(self, path, maxdepth=None, withdirs=False, **kwargs):
355 """List all files below path.
356
357 Like posix ``find`` command without conditions
358
359 Parameters
360 ----------
361 path : str
362 maxdepth: int or None
363 If not None, the maximum number of levels to descend
364 withdirs: bool
365 Whether to include directory paths in the output. This is True
366 when used by glob, but users usually only want files.
367 kwargs are passed to ``ls``.
368 """
369 # TODO: allow equivalent of -name parameter
370 out = set()
371 for path, dirs, files in self.walk(path, maxdepth, **kwargs):
372 if withdirs:
373 files += dirs
374 for name in files:
375 if name and name not in out:
376 out.add("/".join([path.rstrip("/"), name]) if path else name)
377 if self.isfile(path) and path not in out:
378 # walk works on directories, but find should also return [path]
379 # when path happens to be a file
380 out.add(path)
381 return sorted(out)
382
383 def du(self, path, total=True, maxdepth=None, **kwargs):
384 """Space used by files within a path
385
386 Parameters
387 ----------
388 path: str
389 total: bool
390 whether to sum all the file sizes
391 maxdepth: int or None
392 maximum number of directory levels to descend, None for unlimited.
393 kwargs: passed to ``ls``
394
395 Returns
396 -------
397 Dict of {fn: size} if total=False, or int otherwise, where numbers
398 refer to bytes used.
399 """
400 sizes = {}
401 for f in self.find(path, maxdepth=maxdepth, **kwargs):
402 info = self.info(f)
403 sizes[info["name"]] = info["size"]
404 if total:
405 return sum(sizes.values())
406 else:
407 return sizes
408
409 def glob(self, path, **kwargs):
410 """
411 Find files by glob-matching.
412
413 If the path ends with '/' and does not contain "*", it is essentially
414 the same as ``ls(path)``, returning only files.
415
416 We support ``"**"``,
417 ``"?"`` and ``"[..]"``.
418
419 kwargs are passed to ``ls``.
420 """
421 import re
422 from glob import has_magic
423
424 ends = path.endswith("/")
425 path = self._strip_protocol(path)
426 indstar = path.find("*") if path.find("*") >= 0 else len(path)
427 indques = path.find("?") if path.find("?") >= 0 else len(path)
428 indbrace = path.find("[") if path.find("[") >= 0 else len(path)
429
430 ind = min(indstar, indques, indbrace)
431
432 if not has_magic(path):
433 root = path
434 depth = 1
435 if ends:
436 path += "/*"
437 elif self.exists(path):
438 return [path]
439 else:
440 return [] # glob of non-existent returns empty
441 elif "/" in path[:ind]:
442 ind2 = path[:ind].rindex("/")
443 root = path[: ind2 + 1]
444 depth = 20 if "**" in path else path[ind2 + 1 :].count("/") + 1
445 else:
446 root = ""
447 depth = 20 if "**" in path else 1
448 allpaths = self.find(root, maxdepth=depth, withdirs=True, **kwargs)
449 pattern = (
450 "^"
451 + (
452 path.replace("\\", r"\\")
453 .replace(".", r"\.")
454 .replace("+", r"\+")
455 .replace("//", "/")
456 .replace("(", r"\(")
457 .replace(")", r"\)")
458 .replace("|", r"\|")
459 .rstrip("/")
460 .replace("?", ".")
461 )
462 + "$"
463 )
464 pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
465 pattern = re.sub("[*]", "[^/]*", pattern)
466 pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
467 out = {p for p in allpaths if pattern.match(p.replace("//", "/").rstrip("/"))}
468 return list(sorted(out))
469
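    # Illustrative sketch (not from the original source), matching the rules in
    # the docstring above: "*" matches within one path segment, "**" recurses
    # into subdirectories, and a trailing "/" behaves like ls().
    #
    #     fs.glob("/data/*.csv")       # CSVs directly under /data (placeholder paths)
    #     fs.glob("/data/**/*.csv")    # CSVs at any depth below /data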
470 def exists(self, path):
471 """Is there a file at the given path"""
472 try:
473 self.info(path)
474 return True
475 except: # noqa: E722
476 # any exception allowed bar FileNotFoundError?
477 return False
478
479 def info(self, path, **kwargs):
480 """Give details of entry at path
481
482 Returns a single dictionary, with exactly the same information as ``ls``
483 would with ``detail=True``.
484
485         The default implementation calls ls and could be overridden by a
486         shortcut. kwargs are passed on to ``ls()``.
487
488 Some file systems might not be able to measure the file's size, in
489 which case, the returned dict will include ``'size': None``.
490
491 Returns
492 -------
493 dict with keys: name (full path in the FS), size (in bytes), type (file,
494 directory, or something else) and other FS-specific keys.
495 """
496 path = self._strip_protocol(path)
497 out = self.ls(self._parent(path), detail=True, **kwargs)
498 out = [o for o in out if o["name"].rstrip("/") == path]
499 if out:
500 return out[0]
501 out = self.ls(path, detail=True, **kwargs)
502 path = path.rstrip("/")
503 out1 = [o for o in out if o["name"].rstrip("/") == path]
504 if len(out1) == 1:
505 if "size" not in out1[0]:
506 out1[0]["size"] = None
507 return out1[0]
508 elif len(out1) > 1 or out:
509 return {"name": path, "size": 0, "type": "directory"}
510 else:
511 raise FileNotFoundError(path)
512
513 def checksum(self, path):
514 """Unique value for current version of file
515
516 If the checksum is the same from one moment to another, the contents
517 are guaranteed to be the same. If the checksum changes, the contents
518 *might* have changed.
519
520 This should normally be overridden; default will probably capture
521 creation/modification timestamp (which would be good) or maybe
522 access timestamp (which would be bad)
523 """
524 return int(tokenize(self.info(path)), 16)
525
526 def size(self, path):
527 """Size in bytes of file"""
528 return self.info(path).get("size", None)
529
530 def isdir(self, path):
531 """Is this entry directory-like?"""
532 try:
533 return self.info(path)["type"] == "directory"
534 except FileNotFoundError:
535 return False
536
537 def isfile(self, path):
538 """Is this entry file-like?"""
539 try:
540 return self.info(path)["type"] == "file"
541 except: # noqa: E722
542 return False
543
544 def cat(self, path):
545 """ Get the content of a file """
546 return self.open(path, "rb").read()
547
548 def get(self, rpath, lpath, recursive=False, **kwargs):
549 """Copy file to local.
550
551 Possible extension: maybe should be able to copy to any file-system
552 (streaming through local).
553 """
554 rpath = self._strip_protocol(rpath)
555 if recursive:
556 rpaths = self.find(rpath)
557 lpaths = [
558 os.path.join(lpath, path[len(rpath) :].lstrip("/")) for path in rpaths
559 ]
560 for lpath in lpaths:
561 dirname = os.path.dirname(lpath)
562 if not os.path.isdir(dirname):
563 os.makedirs(dirname)
564 else:
565 rpaths = [rpath]
566 lpaths = [lpath]
567 for lpath, rpath in zip(lpaths, rpaths):
568 with self.open(rpath, "rb", **kwargs) as f1:
569 with open(lpath, "wb") as f2:
570 data = True
571 while data:
572 data = f1.read(self.blocksize)
573 f2.write(data)
574
575 def put(self, lpath, rpath, recursive=False, **kwargs):
576 """ Upload file from local """
577 if recursive:
578 lpaths = []
579 for dirname, subdirlist, filelist in os.walk(lpath):
580 lpaths += [os.path.join(dirname, filename) for filename in filelist]
581 rootdir = os.path.basename(lpath.rstrip("/"))
582 if self.exists(rpath):
583 # copy lpath inside rpath directory
584 rpath2 = os.path.join(rpath, rootdir)
585 else:
586 # copy lpath as rpath directory
587 rpath2 = rpath
588 rpaths = [
589 os.path.join(rpath2, path[len(lpath) :].lstrip("/")) for path in lpaths
590 ]
591 else:
592 lpaths = [lpath]
593 rpaths = [rpath]
594 for lpath, rpath in zip(lpaths, rpaths):
595 with open(lpath, "rb") as f1:
596 with self.open(rpath, "wb", **kwargs) as f2:
597 data = True
598 while data:
599 data = f1.read(self.blocksize)
600 f2.write(data)
601
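    # Illustrative sketch (not from the original source): mirroring a remote
    # tree to a local directory and back; all paths are placeholders.
    #
    #     fs.get("remote/dir", "/tmp/localcopy", recursive=True)
    #     fs.put("/tmp/localcopy", "remote/dir2", recursive=True)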
602 def head(self, path, size=1024):
603 """ Get the first ``size`` bytes from file """
604 with self.open(path, "rb") as f:
605 return f.read(size)
606
607 def tail(self, path, size=1024):
608 """ Get the last ``size`` bytes from file """
609 with self.open(path, "rb") as f:
610 f.seek(max(-size, -f.size), 2)
611 return f.read()
612
613 def copy(self, path1, path2, **kwargs):
614 """ Copy within two locations in the filesystem"""
615 raise NotImplementedError
616
617 def mv(self, path1, path2, **kwargs):
618 """ Move file from one location to another """
619 self.copy(path1, path2, **kwargs)
620 self.rm(path1, recursive=False)
621
622 def _rm(self, path):
623 """Delete a file"""
624 raise NotImplementedError
625
626 def rm(self, path, recursive=False, maxdepth=None):
627 """Delete files.
628
629 Parameters
630 ----------
631 path: str or list of str
632 File(s) to delete.
633 recursive: bool
634 If file(s) are directories, recursively delete contents and then
635 also remove the directory
636 maxdepth: int or None
637 Depth to pass to walk for finding files to delete, if recursive.
638 If None, there will be no limit and infinite recursion may be
639 possible.
640 """
641 # prefer some bulk method, if possible
642 if not isinstance(path, list):
643 path = [path]
644 for p in path:
645 if recursive:
646 out = self.walk(p, maxdepth=maxdepth)
647 for pa_, _, files in reversed(list(out)):
648 for name in files:
649 fn = "/".join([pa_, name]) if pa_ else name
650 self.rm(fn)
651 self.rmdir(pa_)
652 else:
653 self._rm(p)
654
655 @classmethod
656 def _parent(cls, path):
657 path = cls._strip_protocol(path.rstrip("/"))
658 if "/" in path:
659 return cls.root_marker + path.rsplit("/", 1)[0]
660 else:
661 return cls.root_marker
662
663 def _open(
664 self,
665 path,
666 mode="rb",
667 block_size=None,
668 autocommit=True,
669 cache_options=None,
670 **kwargs
671 ):
672 """Return raw bytes-mode file-like from the file-system"""
673 return AbstractBufferedFile(
674 self,
675 path,
676 mode,
677 block_size,
678 autocommit,
679 cache_options=cache_options,
680 **kwargs
681 )
682
683 def open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
684 """
685 Return a file-like object from the filesystem
686
687 The resultant instance must function correctly in a context ``with``
688 block.
689
690 Parameters
691 ----------
692 path: str
693 Target file
694 mode: str like 'rb', 'w'
695 See builtin ``open()``
696 block_size: int
697 Some indication of buffering - this is a value in bytes
698 cache_options : dict, optional
699 Extra arguments to pass through to the cache.
700 encoding, errors, newline: passed on to TextIOWrapper for text mode
701 """
702 import io
703
704 path = self._strip_protocol(path)
705 if "b" not in mode:
706 mode = mode.replace("t", "") + "b"
707
708 text_kwargs = {
709 k: kwargs.pop(k)
710 for k in ["encoding", "errors", "newline"]
711 if k in kwargs
712 }
713 return io.TextIOWrapper(
714 self.open(path, mode, block_size, **kwargs), **text_kwargs
715 )
716 else:
717 ac = kwargs.pop("autocommit", not self._intrans)
718 f = self._open(
719 path,
720 mode=mode,
721 block_size=block_size,
722 autocommit=ac,
723 cache_options=cache_options,
724 **kwargs
725 )
726 if not ac:
727 self.transaction.files.append(f)
728 return f
729
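    # Example (annotation, not part of the original source): text-mode opens wrap
    # the binary file in a TextIOWrapper, so encoding/errors/newline are honoured.
    # A usage sketch, assuming "/myfile" exists on the filesystem:
    #
    #     with fs.open("/myfile", "rt", encoding="latin1") as f:
    #         text = f.read()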
730 def touch(self, path, truncate=True, **kwargs):
731 """ Create empty file, or update timestamp
732
733 Parameters
734 ----------
735 path: str
736 file location
737 truncate: bool
738 If True, always set file size to 0; if False, update timestamp and
739 leave file unchanged, if backend allows this
740 """
741 if truncate or not self.exists(path):
742 with self.open(path, "wb", **kwargs):
743 pass
744 else:
745 raise NotImplementedError # update timestamp, if possible
746
747 def ukey(self, path):
748 """Hash of file properties, to tell if it has changed"""
749 return md5(str(self.info(path)).encode()).hexdigest()
750
751 def read_block(self, fn, offset, length, delimiter=None):
752         """ Read a block of bytes from a file
753
754 Starting at ``offset`` of the file, read ``length`` bytes. If
755 ``delimiter`` is set then we ensure that the read starts and stops at
756 delimiter boundaries that follow the locations ``offset`` and ``offset
757 + length``. If ``offset`` is zero then we start at zero. The
758 bytestring returned WILL include the end delimiter string.
759
760 If offset+length is beyond the eof, reads to eof.
761
762 Parameters
763 ----------
764 fn: string
765 Path to filename
766 offset: int
767 Byte offset to start read
768 length: int
769 Number of bytes to read
770 delimiter: bytes (optional)
771 Ensure reading starts and stops at delimiter bytestring
772
773 Examples
774 --------
775 >>> fs.read_block('data/file.csv', 0, 13) # doctest: +SKIP
776 b'Alice, 100\\nBo'
777 >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n') # doctest: +SKIP
778 b'Alice, 100\\nBob, 200\\n'
779
780 Use ``length=None`` to read to the end of the file.
781 >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n') # doctest: +SKIP
782 b'Alice, 100\\nBob, 200\\nCharlie, 300'
783
784 See Also
785 --------
786 utils.read_block
787 """
788 with self.open(fn, "rb") as f:
789 size = f.size
790 if length is None:
791 length = size
792 if size is not None and offset + length > size:
793 length = size - offset
794 return read_block(f, offset, length, delimiter)
795
796 def __reduce__(self):
797 return make_instance, (type(self), self.storage_args, self.storage_options)
798
799 def _get_pyarrow_filesystem(self):
800 """
801 Make a version of the FS instance which will be acceptable to pyarrow
802 """
803 # all instances already also derive from pyarrow
804 return self
805
806 def get_mapper(self, root, check=False, create=False):
807 """Create key/value store based on this file-system
808
809         Makes a MutableMapping interface to the FS at the given root path.
810 See ``fsspec.mapping.FSMap`` for further details.
811 """
812 from .mapping import FSMap
813
814 return FSMap(root, self, check, create)
815
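    # Example (annotation, not part of the original source): the returned FSMap
    # behaves like a dict keyed by path, e.g. (sketch, assuming a writable FS):
    #
    #     m = fs.get_mapper("/root/path")
    #     m["key0"] = b"data"
    #     assert m["key0"] == b"data"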
816 @classmethod
817 def clear_instance_cache(cls):
818 """
819 Clear the cache of filesystem instances.
820
821 Notes
822 -----
823 Unless overridden by setting the ``cachable`` class attribute to False,
824 the filesystem class stores a reference to newly created instances. This
825 prevents Python's normal rules around garbage collection from working,
826         since the instances' refcounts will not drop to zero until
827 ``clear_instance_cache`` is called.
828 """
829 cls._cache.clear()
830
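    # Example (annotation, not part of the original source): because instances are
    # cached, constructing a filesystem twice with the same arguments returns the
    # same object until the cache is cleared, as the memory filesystem tests in
    # this changeset show:
    #
    #     fs1 = MemoryFileSystem()
    #     fs2 = MemoryFileSystem()
    #     assert fs1 is fs2
    #     MemoryFileSystem.clear_instance_cache()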
831 # ------------------------------------------------------------------------
832 # Aliases
833
834 def makedir(self, path, create_parents=True, **kwargs):
835 """Alias of :ref:`FilesystemSpec.mkdir`."""
836 return self.mkdir(path, create_parents=create_parents, **kwargs)
837
838 def mkdirs(self, path, exist_ok=False):
839 """Alias of :ref:`FilesystemSpec.makedirs`."""
840 return self.makedirs(path, exist_ok=exist_ok)
841
842 def listdir(self, path, detail=True, **kwargs):
843 """Alias of :ref:`FilesystemSpec.ls`."""
844 return self.ls(path, detail=detail, **kwargs)
845
846 def cp(self, path1, path2, **kwargs):
847 """Alias of :ref:`FilesystemSpec.copy`."""
848 return self.copy(path1, path2, **kwargs)
849
850 def move(self, path1, path2, **kwargs):
851 """Alias of :ref:`FilesystemSpec.mv`."""
852 return self.mv(path1, path2, **kwargs)
853
854 def stat(self, path, **kwargs):
855 """Alias of :ref:`FilesystemSpec.info`."""
856 return self.info(path, **kwargs)
857
858 def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
859 """Alias of :ref:`FilesystemSpec.du`."""
860 return self.du(path, total=total, maxdepth=maxdepth, **kwargs)
861
862 def rename(self, path1, path2, **kwargs):
863 """Alias of :ref:`FilesystemSpec.mv`."""
864 return self.mv(path1, path2, **kwargs)
865
866 def delete(self, path, recursive=False, maxdepth=None):
867 """Alias of :ref:`FilesystemSpec.rm`."""
868 return self.rm(path, recursive=recursive, maxdepth=maxdepth)
869
870 def upload(self, lpath, rpath, recursive=False, **kwargs):
871 """Alias of :ref:`FilesystemSpec.put`."""
872 return self.put(lpath, rpath, recursive=recursive, **kwargs)
873
874 def download(self, rpath, lpath, recursive=False, **kwargs):
875 """Alias of :ref:`FilesystemSpec.get`."""
876 return self.get(rpath, lpath, recursive=recursive, **kwargs)
877
878
879 class AbstractBufferedFile(io.IOBase):
880 """Convenient class to derive from to provide buffering
881
882 In the case that the backend does not provide a pythonic file-like object
883 already, this class contains much of the logic to build one. The only
884 methods that need to be overridden are ``_upload_chunk``,
885     ``_initiate_upload`` and ``_fetch_range``.
886 """
887
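    # Example (annotation, not part of the original source): a minimal backend file
    # would look roughly like the sketch below; the class name is hypothetical and
    # the bodies would talk to the actual storage service.
    #
    #     class MyBackendFile(AbstractBufferedFile):
    #         def _initiate_upload(self):
    #             ...  # e.g. start a multipart upload on the remote store
    #
    #         def _upload_chunk(self, final=False):
    #             ...  # send self.buffer contents; finalize when final is True
    #
    #         def _fetch_range(self, start, end):
    #             ...  # return bytes [start, end) from the remote store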
888 DEFAULT_BLOCK_SIZE = 5 * 2 ** 20
889
890 def __init__(
891 self,
892 fs,
893 path,
894 mode="rb",
895 block_size="default",
896 autocommit=True,
897 cache_type="readahead",
898 cache_options=None,
899 **kwargs
900 ):
901 """
902 Template for files with buffered reading and writing
903
904 Parameters
905 ----------
906 fs: instance of FileSystem
907 path: str
908 location in file-system
909 mode: str
910 Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file
911 systems may be read-only, and some may not support append.
912 block_size: int
913 Buffer size for reading or writing, 'default' for class default
914 autocommit: bool
915 Whether to write to final destination; may only impact what
916 happens when file is being closed.
917 cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead"
918 Caching policy in read mode. See the definitions in ``core``.
919 cache_options : dict
920 Additional options passed to the constructor for the cache specified
921 by `cache_type`.
922 kwargs:
923 Gets stored as self.kwargs
924 """
925 from .core import caches
926
927 self.path = path
928 self.fs = fs
929 self.mode = mode
930 self.blocksize = (
931 self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
932 )
933 self.loc = 0
934 self.autocommit = autocommit
935 self.end = None
936 self.start = None
937 self.closed = False
938
939 if cache_options is None:
940 cache_options = {}
941
942 if "trim" in kwargs:
943 warnings.warn(
944 "Passing 'trim' to control the cache behavior has been deprecated. "
945 "Specify it within the 'cache_options' argument instead.",
946 FutureWarning,
947 )
948 cache_options["trim"] = kwargs.pop("trim")
949
950 self.kwargs = kwargs
951
952 if mode not in {"ab", "rb", "wb"}:
953 raise NotImplementedError("File mode not supported")
954 if mode == "rb":
955 if not hasattr(self, "details"):
956 self.details = fs.info(path)
957 self.size = self.details["size"]
958 self.cache = caches[cache_type](
959 self.blocksize, self._fetch_range, self.size, **cache_options
960 )
961 else:
962 self.buffer = io.BytesIO()
963 self.offset = None
964 self.forced = False
965 self.location = None
966
967 @property
968 def closed(self):
969 # get around this attr being read-only in IOBase
970 return self._closed
971
972 @closed.setter
973 def closed(self, c):
974 self._closed = c
975
976 def __hash__(self):
977 if "w" in self.mode:
978 return id(self)
979 else:
980 return int(tokenize(self.details), 16)
981
982 def __eq__(self, other):
983 """Files are equal if they have the same checksum, only in read mode"""
984 return self.mode == "rb" and other.mode == "rb" and hash(self) == hash(other)
985
986 def commit(self):
987 """Move from temp to final destination"""
988
989 def discard(self):
990 """Throw away temporary file"""
991
992 def info(self):
993 """ File information about this path """
994 if "r" in self.mode:
995 return self.details
996 else:
997 raise ValueError("Info not available while writing")
998
999 def tell(self):
1000 """ Current file location """
1001 return self.loc
1002
1003 def seek(self, loc, whence=0):
1004 """ Set current file location
1005
1006 Parameters
1007 ----------
1008 loc: int
1009 byte location
1010 whence: {0, 1, 2}
1011 from start of file, current location or end of file, resp.
1012 """
1013 loc = int(loc)
1014 if not self.mode == "rb":
1015 raise ValueError("Seek only available in read mode")
1016 if whence == 0:
1017 nloc = loc
1018 elif whence == 1:
1019 nloc = self.loc + loc
1020 elif whence == 2:
1021 nloc = self.size + loc
1022 else:
1023 raise ValueError("invalid whence (%s, should be 0, 1 or 2)" % whence)
1024 if nloc < 0:
1025 raise ValueError("Seek before start of file")
1026 self.loc = nloc
1027 return self.loc
1028
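    # Example (annotation, not part of the original source): seeking relative to the
    # current position or to end-of-file, as the FTP-backed tests in this changeset
    # exercise (sketch, assuming ``f`` is open in "rb" mode):
    #
    #     f.seek(-10, 2)   # ten bytes before end-of-file
    #     f.seek(-1, 1)    # back one byte from the current position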
1029 def write(self, data):
1030 """
1031 Write data to buffer.
1032
1033 Buffer only sent on flush() or if buffer is greater than
1034 or equal to blocksize.
1035
1036 Parameters
1037 ----------
1038 data: bytes
1039 Set of bytes to be written.
1040 """
1041 if self.mode not in {"wb", "ab"}:
1042 raise ValueError("File not in write mode")
1043 if self.closed:
1044 raise ValueError("I/O operation on closed file.")
1045 if self.forced:
1046 raise ValueError("This file has been force-flushed, can only close")
1047 out = self.buffer.write(data)
1048 self.loc += out
1049 if self.buffer.tell() >= self.blocksize:
1050 self.flush()
1051 return out
1052
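    # Example (annotation, not part of the original source): writes accumulate in
    # the buffer and are only sent to the backend once the blocksize is reached or
    # the file is closed/flushed with force=True (sketch):
    #
    #     with fs.open("/out", "wb") as f:
    #         f.write(b"hello")   # buffered; returns 5
    #     # close() force-flushes the remaining buffer to the backend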
1053 def flush(self, force=False):
1054 """
1055 Write buffered data to backend store.
1056
1057 Writes the current buffer, if it is larger than the block-size, or if
1058 the file is being closed.
1059
1060 Parameters
1061 ----------
1062 force: bool
1063 When closing, write the last block even if it is smaller than
1064 blocks are allowed to be. Disallows further writing to this file.
1065 """
1066
1067 if self.closed:
1068 raise ValueError("Flush on closed file")
1069 if force and self.forced:
1070 raise ValueError("Force flush cannot be called more than once")
1071 if force:
1072 self.forced = True
1073
1074 if self.mode not in {"wb", "ab"}:
1075 # no-op to flush on read-mode
1076 return
1077
1078 if not force and self.buffer.tell() < self.blocksize:
1079 # Defer write on small block
1080 return
1081
1082 if self.offset is None:
1083 # Initialize a multipart upload
1084 self.offset = 0
1085 self._initiate_upload()
1086
1087 if self._upload_chunk(final=force) is not False:
1088 self.offset += self.buffer.seek(0, 2)
1089 self.buffer = io.BytesIO()
1090
1091 def _upload_chunk(self, final=False):
1092 """ Write one part of a multi-block file upload
1093
1094 Parameters
1095 ==========
1096 final: bool
1097 This is the last block, so should complete file, if
1098 self.autocommit is True.
1099 """
1100         # may not yet have been initialized; may need to call _initiate_upload
1101
1102 def _initiate_upload(self):
1103 """ Create remote file/upload """
1104 pass
1105
1106 def _fetch_range(self, start, end):
1107 """Get the specified set of bytes from remote"""
1108 raise NotImplementedError
1109
1110 def read(self, length=-1):
1111 """
1112 Return data from cache, or fetch pieces as necessary
1113
1114 Parameters
1115 ----------
1116 length: int (-1)
1117 Number of bytes to read; if <0, all remaining bytes.
1118 """
1119 length = -1 if length is None else int(length)
1120 if self.mode != "rb":
1121 raise ValueError("File not in read mode")
1122 if length < 0:
1123 length = self.size - self.loc
1124 if self.closed:
1125 raise ValueError("I/O operation on closed file.")
1126 logger.debug("%s read: %i - %i" % (self, self.loc, self.loc + length))
1127 if length == 0:
1128 # don't even bother calling fetch
1129 return b""
1130 out = self.cache._fetch(self.loc, self.loc + length)
1131 self.loc += len(out)
1132 return out
1133
1134 def readinto(self, b):
1135 """mirrors builtin file's readinto method
1136
1137 https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
1138 """
1139 data = self.read(len(b))
1140 b[: len(data)] = data
1141 return len(data)
1142
1143 def readuntil(self, char=b"\n", blocks=None):
1144 """Return data between current position and first occurrence of char
1145
1146         char is included in the output, except if the end of the file is
1147 encountered first.
1148
1149 Parameters
1150 ----------
1151 char: bytes
1152 Thing to find
1153 blocks: None or int
1154 How much to read in each go. Defaults to file blocksize - which may
1155 mean a new read on every call.
1156 """
1157 out = []
1158 while True:
1159 start = self.tell()
1160 part = self.read(blocks or self.blocksize)
1161 if len(part) == 0:
1162 break
1163 found = part.find(char)
1164 if found > -1:
1165 out.append(part[: found + len(char)])
1166 self.seek(start + found + len(char))
1167 break
1168 out.append(part)
1169 return b"".join(out)
1170
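    # Example (annotation, not part of the original source): with file contents
    # starting b"hello...", as in the tests in this changeset (sketch):
    #
    #     f.readuntil(b"l")   # -> b"hel"; the delimiter is included
    #     f.tell()            # -> 3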
1171 def readline(self):
1172 """Read until first occurrence of newline character
1173
1174 Note that, because of character encoding, this is not necessarily a
1175 true line ending.
1176 """
1177 return self.readuntil(b"\n")
1178
1179 def __next__(self):
1180 out = self.readline()
1181 if out:
1182 return out
1183 raise StopIteration
1184
1185 def __iter__(self):
1186 return self
1187
1188 def readlines(self):
1189 """Return all data, split by the newline character"""
1190 data = self.read()
1191 lines = data.split(b"\n")
1192 out = [l + b"\n" for l in lines[:-1]]
1193 if data.endswith(b"\n"):
1194 return out
1195 else:
1196 return out + [lines[-1]]
1197 # return list(self) ???
1198
1199 def readinto1(self, b):
1200 return self.readinto(b)
1201
1202 def close(self):
1203 """ Close file
1204
1205 Finalizes writes, discards cache
1206 """
1207 if self.closed:
1208 return
1209 if self.mode == "rb":
1210 self.cache = None
1211 else:
1212 if not self.forced:
1213 self.flush(force=True)
1214
1215 if self.fs is not None:
1216 self.fs.invalidate_cache(self.path)
1217 self.fs.invalidate_cache(self.fs._parent(self.path))
1218
1219 self.closed = True
1220
1221 def readable(self):
1222 """Whether opened for reading"""
1223 return self.mode == "rb" and not self.closed
1224
1225 def seekable(self):
1226 """Whether is seekable (only in read mode)"""
1227 return self.readable()
1228
1229 def writable(self):
1230 """Whether opened for writing"""
1231 return self.mode in {"wb", "ab"} and not self.closed
1232
1233 def __del__(self):
1234 self.close()
1235
1236 def __str__(self):
1237 return "<File-like object %s, %s>" % (type(self.fs).__name__, self.path)
1238
1239 __repr__ = __str__
1240
1241 def __enter__(self):
1242 return self
1243
1244 def __exit__(self, *args):
1245 self.close()
(New empty file)
0 """Tests the spec, using memoryfs"""
1
2 import os
3 import pickle
4 from fsspec.implementations.memory import MemoryFileSystem, MemoryFile
5
6
7 def test_idempotent():
8 MemoryFileSystem.clear_instance_cache()
9 fs = MemoryFileSystem()
10 fs2 = MemoryFileSystem()
11 assert fs is fs2
12 assert MemoryFileSystem.current() is fs2
13
14 MemoryFileSystem.clear_instance_cache()
15 assert not MemoryFileSystem._cache
16
17 fs2 = MemoryFileSystem().current()
18 assert fs == fs2
19
20
21 def test_pickle():
22 fs = MemoryFileSystem()
23 fs2 = pickle.loads(pickle.dumps(fs))
24 assert fs == fs2
25
26
27 def test_class_methods():
28 assert MemoryFileSystem._strip_protocol("memory:stuff") == "stuff"
29 assert MemoryFileSystem._strip_protocol("memory://stuff") == "stuff"
30 assert MemoryFileSystem._strip_protocol("stuff") == "stuff"
31 assert MemoryFileSystem._strip_protocol("other://stuff") == "other://stuff"
32
33 assert MemoryFileSystem._get_kwargs_from_urls("memory://user@thing") == {}
34
35
36 def test_get_put(tmpdir):
37 tmpdir = str(tmpdir)
38 fn = os.path.join(tmpdir, "one")
39 open(fn, "wb").write(b"one")
40 os.mkdir(os.path.join(tmpdir, "dir"))
41 fn2 = os.path.join(tmpdir, "dir", "two")
42 open(fn2, "wb").write(b"two")
43
44 fs = MemoryFileSystem()
45 fs.put(fn, "/afile")
46 assert fs.cat("/afile") == b"one"
47
48 fs.store["/bfile"] = MemoryFile(fs, "/bfile", b"data")
49 fn3 = os.path.join(tmpdir, "three")
50 fs.get("/bfile", fn3)
51 assert open(fn3, "rb").read() == b"data"
52
53 fs.put(tmpdir, "/more", recursive=True)
54 assert fs.find("/more") == ["/more/dir/two", "/more/one", "/more/three"]
55
56 for f in [fn, fn2, fn3]:
57 os.remove(f)
58 os.rmdir(os.path.join(tmpdir, "dir"))
59
60 fs.get("/more/", tmpdir + "/", recursive=True)
61 assert open(fn3, "rb").read() == b"data"
62 assert open(fn, "rb").read() == b"one"
63
64
65 def test_du():
66 fs = MemoryFileSystem()
67 fs.store = {
68 "/dir/afile": MemoryFile(fs, "/afile", b"a"),
69 "/dir/dirb/afile": MemoryFile(fs, "/afile", b"bb"),
70 "/dir/dirb/bfile": MemoryFile(fs, "/afile", b"ccc"),
71 }
72 assert fs.du("/dir") == 6
73 assert fs.du("/dir", total=False)["/dir/dirb/afile"] == 2
74 assert fs.du("/dir", maxdepth=0) == 1
75
76
77 def test_head_tail():
78 fs = MemoryFileSystem()
79 with fs.open("/myfile", "wb") as f:
80 f.write(b"I had a nice big cabbage")
81 assert fs.head("/myfile", 5) == b"I had"
82 assert fs.tail("/myfile", 7) == b"cabbage"
83
84
85 def test_move():
86 fs = MemoryFileSystem()
87 with fs.open("/myfile", "wb") as f:
88 f.write(b"I had a nice big cabbage")
89 fs.move("/myfile", "/otherfile")
90 assert not fs.exists("/myfile")
91 assert fs.info("/otherfile")
92 assert isinstance(fs.ukey("/otherfile"), str)
93
94
95 def test_read_block_delimiter():
96 fs = MemoryFileSystem()
97 with fs.open("/myfile", "wb") as f:
98 f.write(b"some\n" b"lines\n" b"of\n" b"text")
99 assert fs.read_block("/myfile", 0, 2, b"\n") == b"some\n"
100 assert fs.read_block("/myfile", 2, 6, b"\n") == b"lines\n"
101 assert fs.read_block("/myfile", 6, 2, b"\n") == b""
102 assert fs.read_block("/myfile", 2, 9, b"\n") == b"lines\nof\n"
103 assert fs.read_block("/myfile", 12, 6, b"\n") == b"text"
104 assert fs.read_block("/myfile", 0, None) == fs.cat("/myfile")
105
106
107 def test_open_text():
108 fs = MemoryFileSystem()
109 with fs.open("/myfile", "wb") as f:
110 f.write(b"some\n" b"lines\n" b"of\n" b"text")
111 f = fs.open("/myfile", "r", encoding="latin1")
112 assert f.encoding == "latin1"
0 import pathlib
1
2 import pytest
3
4 import fsspec.core
5 from fsspec.compression import compr, register_compression
6 from fsspec.utils import compressions, infer_compression
7
8
9 def test_infer_custom_compression():
10 """Inferred compression gets values from fsspec.compression.compr."""
11 assert infer_compression("fn.zip") == "zip"
12 assert infer_compression("fn.gz") == "gzip"
13 assert infer_compression("fn.unknown") is None
14 assert infer_compression("fn.test_custom") is None
15 assert infer_compression("fn.tst") is None
16
17 register_compression("test_custom", lambda f, **kwargs: f, "tst")
18
19 try:
20 assert infer_compression("fn.zip") == "zip"
21 assert infer_compression("fn.gz") == "gzip"
22 assert infer_compression("fn.unknown") is None
23 assert infer_compression("fn.test_custom") is None
24 assert infer_compression("fn.tst") == "test_custom"
25
26 # Duplicate registration in name or extension raises a value error.
27 with pytest.raises(ValueError):
28 register_compression("test_custom", lambda f, **kwargs: f, "tst")
29
30 with pytest.raises(ValueError):
31 register_compression("test_conflicting", lambda f, **kwargs: f, "tst")
32 assert "test_conflicting" not in compr
33
34 # ...but can be forced.
35 register_compression(
36 "test_conflicting", lambda f, **kwargs: f, "tst", force=True
37 )
38 assert infer_compression("fn.zip") == "zip"
39 assert infer_compression("fn.gz") == "gzip"
40 assert infer_compression("fn.unknown") is None
41 assert infer_compression("fn.test_custom") is None
42 assert infer_compression("fn.tst") == "test_conflicting"
43
44 finally:
45 del compr["test_custom"]
46 del compr["test_conflicting"]
47 del compressions["tst"]
48
49
50 def test_lzma_compression_name():
51 pytest.importorskip("lzma")
52 assert infer_compression("fn.xz") == "xz"
53
54
55 def test_lz4_compression(tmpdir):
56 """Infer lz4 compression for .lz4 files if lz4 is available."""
57 tmp_path = pathlib.Path(str(tmpdir))
58
59 lz4 = pytest.importorskip("lz4")
60
61 tmp_path.mkdir(exist_ok=True)
62
63 tdat = "foobar" * 100
64
65 with fsspec.core.open(
66 str(tmp_path / "out.lz4"), mode="wt", compression="infer"
67 ) as outfile:
68 outfile.write(tdat)
69
70 compressed = (tmp_path / "out.lz4").open("rb").read()
71 assert lz4.frame.decompress(compressed).decode() == tdat
72
73 with fsspec.core.open(
74 str(tmp_path / "out.lz4"), mode="rt", compression="infer"
75 ) as infile:
76 assert infile.read() == tdat
77
78 with fsspec.core.open(
79 str(tmp_path / "out.lz4"), mode="rt", compression="lz4"
80 ) as infile:
81 assert infile.read() == tdat
82
83
84 def test_zstd_compression(tmpdir):
85 """Infer zstd compression for .zst files if zstandard is available."""
86 tmp_path = pathlib.Path(str(tmpdir))
87
88 zstd = pytest.importorskip("zstandard")
89
90 tmp_path.mkdir(exist_ok=True)
91
92 tdat = "foobar" * 100
93
94 with fsspec.core.open(
95 str(tmp_path / "out.zst"), mode="wt", compression="infer"
96 ) as outfile:
97 outfile.write(tdat)
98
99 compressed = (tmp_path / "out.zst").open("rb").read()
100 assert zstd.ZstdDecompressor().decompress(compressed, len(tdat)).decode() == tdat
101
102 with fsspec.core.open(
103 str(tmp_path / "out.zst"), mode="rt", compression="infer"
104 ) as infile:
105 assert infile.read() == tdat
106
107 with fsspec.core.open(
108 str(tmp_path / "out.zst"), mode="rt", compression="zstd"
109 ) as infile:
110 assert infile.read() == tdat
111
112
113 def test_snappy_compression(tmpdir):
114 """No registered compression for snappy, but can be specified."""
115 tmp_path = pathlib.Path(str(tmpdir))
116
117 snappy = pytest.importorskip("snappy")
118
119 tmp_path.mkdir(exist_ok=True)
120
121 tdat = "foobar" * 100
122
123 # Snappy isn't inferred.
124 with fsspec.core.open(
125 str(tmp_path / "out.snappy"), mode="wt", compression="infer"
126 ) as outfile:
127 outfile.write(tdat)
128 assert (tmp_path / "out.snappy").open("rb").read().decode() == tdat
129
130 # but can be specified.
131 with fsspec.core.open(
132 str(tmp_path / "out.snappy"), mode="wt", compression="snappy"
133 ) as outfile:
134 outfile.write(tdat)
135
136 compressed = (tmp_path / "out.snappy").open("rb").read()
137 assert snappy.StreamDecompressor().decompress(compressed).decode() == tdat
138
139 with fsspec.core.open(
140 str(tmp_path / "out.snappy"), mode="rb", compression="infer"
141 ) as infile:
142 assert infile.read() == compressed
143
144 with fsspec.core.open(
145 str(tmp_path / "out.snappy"), mode="rt", compression="snappy"
146 ) as infile:
147 assert infile.read() == tdat
0 import pytest
1 import pickle
2 import string
3
4 from fsspec.core import (
5 _expand_paths,
6 OpenFile,
7 caches,
8 get_compression,
9 BaseCache,
10 BlockCache,
11 )
12
13
14 @pytest.mark.parametrize(
15 "path, name_function, num, out",
16 [
17 [["apath"], None, 1, ["apath"]],
18 ["apath.*.csv", None, 1, ["apath.0.csv"]],
19 ["apath.*.csv", None, 2, ["apath.0.csv", "apath.1.csv"]],
20 ["a*", lambda x: "abc"[x], 2, ["aa", "ab"]],
21 ],
22 )
23 def test_expand_paths(path, name_function, num, out):
24 assert _expand_paths(path, name_function, num) == out
25
26
27 def test_expand_error():
28 with pytest.raises(ValueError):
29 _expand_paths("*.*", None, 1)
30
31
32 def test_openfile_api(m):
33 m.open("somepath", "wb").write(b"data")
34 of = OpenFile(m, "somepath")
35 assert str(of) == "<OpenFile 'somepath'>"
36 f = of.open()
37 assert f.read() == b"data"
38 f.close()
39 with OpenFile(m, "somepath", mode="rt") as f:
40 f.read() == "data"
41
42
43 # For test_cache_pickleable(). Functions are only picklable if they are defined
44 # at the top-level of a module
45 def _fetcher(start, end):
46 return b"0" * (end - start)
47
48
49 def letters_fetcher(start, end):
50 return string.ascii_letters[start:end].encode()
51
52
53 @pytest.fixture(params=caches.values(), ids=list(caches.keys()))
54 def Cache_imp(request):
55 return request.param
56
57
58 def test_cache_empty_file(Cache_imp):
59 blocksize = 5
60 size = 0
61 cache = Cache_imp(blocksize, _fetcher, size)
62 assert cache._fetch(0, 0) == b""
63
64
65 def test_cache_pickleable(Cache_imp):
66 blocksize = 5
67 size = 100
68 cache = Cache_imp(blocksize, _fetcher, size)
69 cache._fetch(0, 5) # fill in cache
70 unpickled = pickle.loads(pickle.dumps(cache))
71 assert isinstance(unpickled, Cache_imp)
72 assert unpickled.blocksize == blocksize
73 assert unpickled.size == size
74 assert unpickled._fetch(0, 10) == b"0" * 10
75
76
77 @pytest.mark.parametrize(
78 "size_requests",
79 [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]],
80 )
81 @pytest.mark.parametrize("blocksize", [1, 10, 52, 100])
82 def test_cache_basic(Cache_imp, blocksize, size_requests):
83 cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters))
84
85 for start, end in size_requests:
86 result = cache[start:end]
87 expected = string.ascii_letters[start:end].encode()
88 assert result == expected
89
90
91 def test_xz_lzma_compressions():
92 pytest.importorskip("lzma")
93 # Ensure that both 'xz' and 'lzma' compression names can be parsed
94 assert get_compression("some_file.xz", "infer") == "xz"
95 assert get_compression("some_file.xz", "xz") == "xz"
96 assert get_compression("some_file.xz", "lzma") == "lzma"
97
98
99 def test_cache_getitem(Cache_imp):
100 cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters))
101 assert cacher[0:4] == b"abcd"
102 assert cacher[:4] == b"abcd"
103 assert cacher[-3:] == b"XYZ"
104 assert cacher[-3:-1] == b"XY"
105 assert cacher[2:4] == b"cd"
106
107
108 def test_cache_getitem_raises():
109 cacher = BaseCache(4, letters_fetcher, len(string.ascii_letters))
110 with pytest.raises(TypeError, match="int"):
111 cacher[5]
112
113 with pytest.raises(ValueError, match="contiguous"):
114 cacher[::4]
115
116
117 def test_block_cache_lru():
118 cache = BlockCache(4, letters_fetcher, len(string.ascii_letters), maxblocks=2)
119 # miss
120 cache[0:2]
121 assert cache.cache_info().hits == 0
122 assert cache.cache_info().misses == 1
123 assert cache.cache_info().currsize == 1
124
125 # hit
126 cache[0:2]
127 assert cache.cache_info().hits == 1
128 assert cache.cache_info().misses == 1
129 assert cache.cache_info().currsize == 1
130
131 # miss
132 cache[4:6]
133 assert cache.cache_info().hits == 1
134 assert cache.cache_info().misses == 2
135 assert cache.cache_info().currsize == 2
136
137 # miss & evict
138 cache[12:13]
139 assert cache.cache_info().hits == 1
140 assert cache.cache_info().misses == 3
141 assert cache.cache_info().currsize == 2
0 """Tests abstract buffered file API, using FTP implementation"""
1 import pickle
2 import sys
3 import pytest
4 from fsspec.implementations.tests.test_ftp import FTPFileSystem
5
6 data = b"hello" * 10000
7
8
9 @pytest.mark.xfail(
10 sys.version_info < (3, 6),
11 reason="py35 error, see https://github.com/intake/filesystem_spec/issues/147",
12 )
13 def test_pickle(ftp_writable):
14 host, port, user, pw = ftp_writable
15 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
16
17 f = ftp.open("/out", "rb")
18
19 f2 = pickle.loads(pickle.dumps(f))
20 assert f == f2
21
22
23 def test_file_read_attributes(ftp_writable):
24 host, port, user, pw = ftp_writable
25 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
26
27 f = ftp.open("/out", "rb")
28 assert f.info()["size"] == len(data)
29 assert f.tell() == 0
30 assert f.seekable()
31 assert f.readable()
32 assert not f.writable()
33 out = bytearray(len(data))
34
35 assert f.read() == data
36 assert f.read() == b""
37 f.seek(0)
38 assert f.readuntil(b"l") == b"hel"
39 assert f.tell() == 3
40
41 f.readinto1(out)
42 assert out[:-3] == data[3:]
43 with pytest.raises(ValueError):
44 f.write(b"")
45 f.close()
46 with pytest.raises(ValueError):
47         f.read()
48
49
50 def test_seek(ftp_writable):
51 host, port, user, pw = ftp_writable
52 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
53
54 f = ftp.open("/out", "rb")
55
56 assert f.seek(-10, 2) == len(data) - 10
57 assert f.tell() == len(data) - 10
58 assert f.seek(-1, 1) == len(data) - 11
59 with pytest.raises(ValueError):
60 f.seek(-1)
61 with pytest.raises(ValueError):
62 f.seek(0, 7)
63
64
65 def test_file_idempotent(ftp_writable):
66 host, port, user, pw = ftp_writable
67 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
68
69 f = ftp.open("/out", "rb")
70 f2 = ftp.open("/out", "rb")
71 assert hash(f) == hash(f2)
72 assert f == f2
73 ftp.touch("/out2")
74 f2 = ftp.open("/out2", "rb")
75 assert hash(f2) != hash(f)
76 assert f != f2
77 f2 = ftp.open("/out", "wb")
78 assert hash(f2) != hash(f)
79
80
81 def test_file_text_attributes(ftp_writable):
82 host, port, user, pw = ftp_writable
83 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
84
85 data = b"hello\n" * 1000
86 with ftp.open("/out2", "wb") as f:
87 f.write(data)
88
89 f = ftp.open("/out2", "rb")
90 assert f.readline() == b"hello\n"
91 f.seek(0)
92 assert list(f) == [d + b"\n" for d in data.split()]
93 f.seek(0)
94 assert f.readlines() == [d + b"\n" for d in data.split()]
95
96 f = ftp.open("/out2", "rt")
97 assert f.readline() == "hello\n"
98 assert f.encoding
99
100
101 def test_file_write_attributes(ftp_writable):
102 host, port, user, pw = ftp_writable
103 ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
104 f = ftp.open("/out2", "wb")
105 with pytest.raises(ValueError):
106 f.info()
107 with pytest.raises(ValueError):
108 f.seek(0)
109 with pytest.raises(ValueError):
110 f.read(0)
111 assert not f.readable()
112 assert f.writable()
113
114 f.flush() # no-op
115
116 assert f.write(b"hello") == 5
117 assert f.write(b"hello") == 5
118 assert not f.closed
119 f.close()
120 assert f.closed
121 with pytest.raises(ValueError):
122 f.write(b"")
123 with pytest.raises(ValueError):
124 f.flush()
125
126
127 def test_midread_cache(ftp_writable):
128 host, port, user, pw = ftp_writable
129 fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
130 fn = "/myfile"
131 with fs.open(fn, "wb") as f:
132 f.write(b"a" * 175627146)
133 with fs.open(fn, "rb") as f:
134 f.seek(175561610)
135 d1 = f.read(65536)
136 assert len(d1) == 65536
137
138 f.seek(4)
139 size = 17562198
140 d2 = f.read(size)
141 assert len(d2) == size
142
143 f.seek(17562288)
144 size = 17562187
145 d3 = f.read(size)
146 assert len(d3) == size
147
148
149 def test_read_block(ftp_writable):
150 # not the same as test_read_block in test_utils, this depends on the
151     # behaviour of the bytes caching
152 from fsspec.utils import read_block
153
154 host, port, user, pw = ftp_writable
155 fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
156 fn = "/myfile"
157 with fs.open(fn, "wb") as f:
158 f.write(b"a,b\n1,2")
159 f = fs.open(fn, "rb", cache_type="bytes")
160 assert read_block(f, 0, 6400, b"\n") == b"a,b\n1,2"
161
162
163 def test_with_gzip(ftp_writable):
164 import gzip
165
166     data = b"some compressible stuff"
167 host, port, user, pw = ftp_writable
168 fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
169 fn = "/myfile"
170 with fs.open(fn, "wb") as f:
171 gf = gzip.GzipFile(fileobj=f, mode="w")
172 gf.write(data)
173 gf.close()
174 with fs.open(fn, "rb") as f:
175 gf = gzip.GzipFile(fileobj=f, mode="r")
176 assert gf.read() == data
0 import os
1 import signal
2 import time
3 from multiprocessing import Process
4
5 import pytest
6
7 pytest.importorskip("fuse") # noqa: E402
8
9 from fsspec.fuse import run
10 from fsspec.implementations.memory import MemoryFileSystem
11
12
13 def host_fuse(mountdir):
14 fs = MemoryFileSystem()
15 fs.touch("/mounted/testfile")
16 run(fs, "/mounted/", mountdir)
17
18
19 def test_basic(tmpdir):
20 mountdir = str(tmpdir.mkdir("mount"))
21
22 fuse_process = Process(target=host_fuse, args=(str(mountdir),))
23 fuse_process.start()
24
25 try:
26 timeout = 10
27 while True:
28 try:
29 # can fail with device not ready while waiting for fuse
30 if "testfile" in os.listdir(mountdir):
31 break
32 except Exception:
33 pass
34 timeout -= 1
35 time.sleep(1)
36 assert timeout > 0, "Timeout"
37
38 fn = os.path.join(mountdir, "test")
39 with open(fn, "wb") as f:
40 f.write(b"data")
41
42 with open(fn) as f:
43 assert f.read() == "data"
44
45 os.remove(fn)
46
47 os.mkdir(fn)
48 assert os.listdir(fn) == []
49
50 os.mkdir(fn + "/inner")
51
52 with pytest.raises(OSError):
53 os.rmdir(fn)
54
55 os.rmdir(fn + "/inner")
56 os.rmdir(fn)
57 finally:
58 os.kill(fuse_process.pid, signal.SIGTERM)
59 fuse_process.join()
0 import os
1 import fsspec
2 from fsspec.implementations.memory import MemoryFileSystem
3 import pickle
4 import pytest
5
6
7 def test_mapping_prefix(tmpdir):
8 tmpdir = str(tmpdir)
9 os.makedirs(os.path.join(tmpdir, "afolder"))
10 open(os.path.join(tmpdir, "afile"), "w").write("test")
11 open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2")
12
13 m = fsspec.get_mapper("file://" + tmpdir)
14 assert "afile" in m
15 assert m["afolder/anotherfile"] == b"test2"
16
17 fs = fsspec.filesystem("file")
18 m2 = fs.get_mapper(tmpdir)
19 m3 = fs.get_mapper("file://" + tmpdir)
20
21 assert m == m2 == m3
22
23
24 def test_ops():
25 MemoryFileSystem.store.clear()
26 m = fsspec.get_mapper("memory://")
27 assert not m
28 assert list(m) == []
29
30 with pytest.raises(KeyError):
31 m["hi"]
32
33 assert m.pop("key", 0) == 0
34
35 m["key0"] = b"data"
36 assert list(m) == ["key0"]
37 assert m["key0"] == b"data"
38
39 m.clear()
40
41 assert list(m) == []
42
43
44 def test_pickle():
45 m = fsspec.get_mapper("memory://")
46 assert isinstance(m.fs, MemoryFileSystem)
47 m["key"] = b"data"
48 m2 = pickle.loads(pickle.dumps(m))
49 assert list(m) == list(m2)
50
51
52 def test_keys_view():
53 # https://github.com/intake/filesystem_spec/issues/186
54 m = fsspec.get_mapper("memory://")
55 m["key"] = b"data"
56
57 keys = m.keys()
58 assert len(keys) == 1
59 # check that we don't consume the keys
60 assert len(keys) == 1
0 import pytest
1 from fsspec.registry import get_filesystem_class, registry
2
3
4 @pytest.mark.parametrize(
5 "protocol,module,minversion,oldversion",
6 [("s3", "s3fs", "0.3.0", "0.1.0"), ("gs", "gcsfs", "0.3.0", "0.1.0")],
7 )
8 def test_minversion_s3fs(protocol, module, minversion, oldversion, monkeypatch):
9 registry.clear()
10 mod = pytest.importorskip(module, minversion)
11
12     assert get_filesystem_class(protocol) is not None
13 registry.clear()
14
15 monkeypatch.setattr(mod, "__version__", oldversion)
16 with pytest.raises(RuntimeError, match=minversion):
17 get_filesystem_class(protocol)
0 import pytest
1 from fsspec.spec import AbstractFileSystem, AbstractBufferedFile
2
3
4 class DummyTestFS(AbstractFileSystem):
5 protocol = "mock"
6 _fs_contents = (
7 {"name": "top_level/second_level/date=2019-10-01/", "type": "directory"},
8 {
9 "name": "top_level/second_level/date=2019-10-01/a.parquet",
10 "type": "file",
11 "size": 100,
12 },
13 {
14 "name": "top_level/second_level/date=2019-10-01/b.parquet",
15 "type": "file",
16 "size": 100,
17 },
18 {"name": "top_level/second_level/date=2019-10-02/", "type": "directory"},
19 {
20 "name": "top_level/second_level/date=2019-10-02/a.parquet",
21 "type": "file",
22 "size": 100,
23 },
24 {"name": "top_level/second_level/date=2019-10-04/", "type": "directory"},
25 {
26 "name": "top_level/second_level/date=2019-10-04/a.parquet",
27 "type": "file",
28 "size": 100,
29 },
30 {"name": "misc/", "type": "directory"},
31 {"name": "misc/foo.txt", "type": "file", "size": 100},
32 )
33
34 def ls(self, path, detail=True, **kwargs):
35 files = (file for file in self._fs_contents if path in file["name"])
36
37 if detail:
38 return list(files)
39
40 return list(sorted([file["name"] for file in files]))
41
42
43 @pytest.mark.parametrize(
44 "test_path, expected",
45 [
46 (
47 "mock://top_level/second_level/date=2019-10-01/a.parquet",
48 ["top_level/second_level/date=2019-10-01/a.parquet"],
49 ),
50 (
51 "mock://top_level/second_level/date=2019-10-01/*",
52 [
53 "top_level/second_level/date=2019-10-01/a.parquet",
54 "top_level/second_level/date=2019-10-01/b.parquet",
55 ],
56 ),
57 (
58 "mock://top_level/second_level/date=2019-10-0[1-4]",
59 [
60 "top_level/second_level/date=2019-10-01",
61 "top_level/second_level/date=2019-10-02",
62 "top_level/second_level/date=2019-10-04",
63 ],
64 ),
65 (
66 "mock://top_level/second_level/date=2019-10-0[1-4]/*",
67 [
68 "top_level/second_level/date=2019-10-01/a.parquet",
69 "top_level/second_level/date=2019-10-01/b.parquet",
70 "top_level/second_level/date=2019-10-02/a.parquet",
71 "top_level/second_level/date=2019-10-04/a.parquet",
72 ],
73 ),
74 (
75 "mock://top_level/second_level/date=2019-10-0[1-4]/[a].*",
76 [
77 "top_level/second_level/date=2019-10-01/a.parquet",
78 "top_level/second_level/date=2019-10-02/a.parquet",
79 "top_level/second_level/date=2019-10-04/a.parquet",
80 ],
81 ),
82 ],
83 )
84 def test_glob(test_path, expected):
85 test_fs = DummyTestFS()
86
87 assert test_fs.glob(test_path) == expected
88
89
90 def test_cache():
91 fs = DummyTestFS()
92 fs2 = DummyTestFS()
93 assert fs is fs2
94
95 assert len(fs._cache) == 1
96 del fs2
97 assert len(fs._cache) == 1
98 del fs
99 assert len(DummyTestFS._cache) == 1
100
101 DummyTestFS.clear_instance_cache()
102 assert len(DummyTestFS._cache) == 0
103
104
105 def test_alias():
106 with pytest.warns(FutureWarning, match="add_aliases"):
107 DummyTestFS(add_aliases=True)
108
109
110 def test_add_docs_warns():
111 with pytest.warns(FutureWarning, match="add_docs"):
112 AbstractFileSystem(add_docs=True)
113
114
115 def test_cache_options():
116 fs = DummyTestFS()
117 f = AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes")
118 assert f.cache.trim
119
120 # TODO: dummy buffered file
121 f = AbstractBufferedFile(
122 fs, "misc/foo.txt", cache_type="bytes", cache_options=dict(trim=False)
123 )
124 assert f.cache.trim is False
125
126 f = fs.open("misc/foo.txt", cache_type="bytes", cache_options=dict(trim=False))
127 assert f.cache.trim is False
128
129
130 def test_trim_kwarg_warns():
131 fs = DummyTestFS()
132 with pytest.warns(FutureWarning, match="cache_options"):
133 AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes", trim=False)
134
135
136 def test_eq():
137 fs = DummyTestFS()
138 result = fs == 1
139 assert result is False
0 import io
1 import pytest
2 from fsspec.utils import infer_storage_options, seek_delimiter, read_block
3
4
5 def test_read_block():
6 delimiter = b"\n"
7 data = delimiter.join([b"123", b"456", b"789"])
8 f = io.BytesIO(data)
9
10 assert read_block(f, 1, 2) == b"23"
11 assert read_block(f, 0, 1, delimiter=b"\n") == b"123\n"
12 assert read_block(f, 0, 2, delimiter=b"\n") == b"123\n"
13 assert read_block(f, 0, 3, delimiter=b"\n") == b"123\n"
14 assert read_block(f, 0, 5, delimiter=b"\n") == b"123\n456\n"
15 assert read_block(f, 0, 8, delimiter=b"\n") == b"123\n456\n789"
16 assert read_block(f, 0, 100, delimiter=b"\n") == b"123\n456\n789"
17 assert read_block(f, 1, 1, delimiter=b"\n") == b""
18 assert read_block(f, 1, 5, delimiter=b"\n") == b"456\n"
19 assert read_block(f, 1, 8, delimiter=b"\n") == b"456\n789"
20
21 for ols in [[(0, 3), (3, 3), (6, 3), (9, 2)], [(0, 4), (4, 4), (8, 4)]]:
22 out = [read_block(f, o, l, b"\n") for o, l in ols]
23 assert b"".join(filter(None, out)) == data
24
25
26 def test_read_block_split_before():
27 """Test start/middle/end cases of split_before.""" # noqa: I
28 d = (
29 "#header" + "".join(">foo{i}\nFOOBAR{i}\n".format(i=i) for i in range(100000))
30 ).encode()
31
32 # Read single record at beginning.
33 # All reads include beginning of file and read through termination of
34 # delimited record.
35 assert read_block(io.BytesIO(d), 0, 10, delimiter=b"\n") == b"#header>foo0\n"
36 assert (
37 read_block(io.BytesIO(d), 0, 10, delimiter=b"\n", split_before=True)
38 == b"#header>foo0"
39 )
40 assert (
41 read_block(io.BytesIO(d), 0, 10, delimiter=b">") == b"#header>foo0\nFOOBAR0\n>"
42 )
43 assert (
44 read_block(io.BytesIO(d), 0, 10, delimiter=b">", split_before=True)
45 == b"#header>foo0\nFOOBAR0\n"
46 )
47
48 # Read multiple records at beginning.
49 # All reads include beginning of file and read through termination of
50 # delimited record.
51 assert (
52 read_block(io.BytesIO(d), 0, 27, delimiter=b"\n")
53 == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
54 )
55 assert (
56 read_block(io.BytesIO(d), 0, 27, delimiter=b"\n", split_before=True)
57 == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1"
58 )
59 assert (
60 read_block(io.BytesIO(d), 0, 27, delimiter=b">")
61 == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n>"
62 )
63 assert (
64 read_block(io.BytesIO(d), 0, 27, delimiter=b">", split_before=True)
65 == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
66 )
67
68 # Read with offset spanning into next record, splits on either side of delimiter.
69 # Read not spanning the full record returns nothing.
70 assert read_block(io.BytesIO(d), 10, 3, delimiter=b"\n") == b"FOOBAR0\n"
71 assert (
72 read_block(io.BytesIO(d), 10, 3, delimiter=b"\n", split_before=True)
73 == b"\nFOOBAR0"
74 )
75 assert read_block(io.BytesIO(d), 10, 3, delimiter=b">") == b""
76 assert read_block(io.BytesIO(d), 10, 3, delimiter=b">", split_before=True) == b""
77
78 # Read with offset spanning multiple records, splits on either side of delimiter
79 assert (
80 read_block(io.BytesIO(d), 10, 20, delimiter=b"\n")
81 == b"FOOBAR0\n>foo1\nFOOBAR1\n"
82 )
83 assert (
84 read_block(io.BytesIO(d), 10, 20, delimiter=b"\n", split_before=True)
85 == b"\nFOOBAR0\n>foo1\nFOOBAR1"
86 )
87 assert read_block(io.BytesIO(d), 10, 20, delimiter=b">") == b"foo1\nFOOBAR1\n>"
88 assert (
89 read_block(io.BytesIO(d), 10, 20, delimiter=b">", split_before=True)
90 == b">foo1\nFOOBAR1\n"
91 )
92
93 # Read record at end, all records read to end
94
95 tlen = len(d)
96
97 assert (
98 read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n")
99 == b">foo99999\nFOOBAR99999\n"
100 )
101
102 assert (
103 read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n", split_before=True)
104 == b"\n>foo99999\nFOOBAR99999\n"
105 )
106
107 assert (
108 read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">")
109 == b"foo99999\nFOOBAR99999\n"
110 )
111
112 assert (
113 read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">", split_before=True)
114 == b">foo99999\nFOOBAR99999\n"
115 )
116
117
118 def test_seek_delimiter_endline():
119 f = io.BytesIO(b"123\n456\n789")
120
121 # if at zero, stay at zero
122 seek_delimiter(f, b"\n", 5)
123 assert f.tell() == 0
124
125 # choose the first block
126 for bs in [1, 5, 100]:
127 f.seek(1)
128 seek_delimiter(f, b"\n", blocksize=bs)
129 assert f.tell() == 4
130
131 # handle long delimiters well, even with short blocksizes
132 f = io.BytesIO(b"123abc456abc789")
133 for bs in [1, 2, 3, 4, 5, 6, 10]:
134 f.seek(1)
135 seek_delimiter(f, b"abc", blocksize=bs)
136 assert f.tell() == 6
137
138 # End at the end
139 f = io.BytesIO(b"123\n456")
140 f.seek(5)
141 seek_delimiter(f, b"\n", 5)
142 assert f.tell() == 7
143
144
145 def test_infer_options():
146 so = infer_storage_options("/mnt/datasets/test.csv")
147 assert so.pop("protocol") == "file"
148 assert so.pop("path") == "/mnt/datasets/test.csv"
149 assert not so
150
151 assert infer_storage_options("./test.csv")["path"] == "./test.csv"
152 assert infer_storage_options("../test.csv")["path"] == "../test.csv"
153
154 so = infer_storage_options("C:\\test.csv")
155 assert so.pop("protocol") == "file"
156 assert so.pop("path") == "C:\\test.csv"
157 assert not so
158
159 assert infer_storage_options("d:\\test.csv")["path"] == "d:\\test.csv"
160 assert infer_storage_options("\\test.csv")["path"] == "\\test.csv"
161 assert infer_storage_options(".\\test.csv")["path"] == ".\\test.csv"
162 assert infer_storage_options("test.csv")["path"] == "test.csv"
163
164 so = infer_storage_options(
165 "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm",
166 inherit_storage_options={"extra": "value"},
167 )
168 assert so.pop("protocol") == "hdfs"
169 assert so.pop("username") == "username"
170 assert so.pop("password") == "pwd"
171 assert so.pop("host") == "Node"
172 assert so.pop("port") == 123
173 assert so.pop("path") == "/mnt/datasets/test.csv#fragm"
174 assert so.pop("url_query") == "q=1"
175 assert so.pop("url_fragment") == "fragm"
176 assert so.pop("extra") == "value"
177 assert not so
178
179 so = infer_storage_options("hdfs://User-name@Node-name.com/mnt/datasets/test.csv")
180 assert so.pop("username") == "User-name"
181 assert so.pop("host") == "Node-name.com"
182
183 u = "http://127.0.0.1:8080/test.csv"
184 assert infer_storage_options(u) == {"protocol": "http", "path": u}
185
186 # For s3 and gcs the netloc is actually the bucket name, so we want to
187 # include it in the path. Test that:
188 # - Parsing doesn't lowercase the bucket
189 # - The bucket is included in path
190 for protocol in ["s3", "gcs", "gs"]:
191 options = infer_storage_options("%s://Bucket-name.com/test.csv" % protocol)
192 assert options["path"] == "Bucket-name.com/test.csv"
193
194 with pytest.raises(KeyError):
195 infer_storage_options("file:///bucket/file.csv", {"path": "collide"})
196 with pytest.raises(KeyError):
197 infer_storage_options("hdfs:///bucket/file.csv", {"protocol": "collide"})
198
199
200 @pytest.mark.parametrize(
201 "urlpath, expected_path",
202 (
203 (r"c:\foo\bar", r"c:\foo\bar"),
204 (r"C:\\foo\bar", r"C:\\foo\bar"),
205 (r"c:/foo/bar", r"c:/foo/bar"),
206 (r"file:///c|\foo\bar", r"c:\foo\bar"),
207 (r"file:///C|/foo/bar", r"C:/foo/bar"),
208 (r"file:///C:/foo/bar", r"C:/foo/bar"),
209 ),
210 )
211 def test_infer_storage_options_c(urlpath, expected_path):
212 so = infer_storage_options(urlpath)
213 assert so["protocol"] == "file"
214 assert so["path"] == expected_path
0 class Transaction(object):
1 """Filesystem transaction write context
2
3 Gathers files for deferred commit or discard, so that several write
4 operations can be finalized semi-atomically. This works by having this
5 instance as the ``.transaction`` attribute of the given filesystem
6 """
7
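    # Example (annotation, not part of the original source): a sketch of the
    # intended usage, assuming the filesystem exposes this instance as
    # ``fs.transaction``; files opened for writing inside the block are committed
    # together on a clean exit and discarded if an exception escapes.
    #
    #     with fs.transaction:
    #         fs.open("/path/one", "wb").write(b"...")
    #         fs.open("/path/two", "wb").write(b"...")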
8 def __init__(self, fs):
9 """
10 Parameters
11 ----------
12 fs: FileSystem instance
13 """
14 self.fs = fs
15 self.files = []
16
17 def __enter__(self):
18 self.start()
19
20 def __exit__(self, exc_type, exc_val, exc_tb):
21 """End transaction and commit, if exit is not due to exception"""
22 # only commit if there was no exception
23 self.complete(commit=exc_type is None)
24 self.fs._intrans = False
25 self.fs._transaction = None
26
27 def start(self):
28 """Start a transaction on this FileSystem"""
29 self.fs._intrans = True
30
31 def complete(self, commit=True):
32 """Finish transaction: commit or discard all deferred files"""
33 for f in self.files:
34 if commit:
35 f.commit()
36 else:
37 f.discard()
38 self.files = []
39 self.fs._intrans = False
40
41
42 class FileActor(object):
43 def __init__(self):
44 self.files = []
45
46 def commit(self):
47 for f in self.files:
48 f.commit()
49 self.files.clear()
50
51 def discard(self):
52 for f in self.files:
53 f.discard()
54 self.files.clear()
55
56 def append(self, f):
57 self.files.append(f)
58
59
60 class DaskTransaction(Transaction):
61 def __init__(self, fs):
62 """
63 Parameters
64 ----------
65 fs: FileSystem instance
66 """
67 import distributed
68
69 super().__init__(fs)
70 client = distributed.default_client()
71 self.files = client.submit(FileActor, actor=True).result()
72
73 def complete(self, commit=True):
74 """Finish transaction: commit or discard all deferred files"""
75 if commit:
76 self.files.commit().result()
77 else:
78 self.files.discard().result()
79 self.fs._intrans = False
0 from hashlib import md5
1 import math
2 import os
3 import pathlib
4 import re
5 from urllib.parse import urlsplit
6
7
8 DEFAULT_BLOCK_SIZE = 5 * 2 ** 20
9
10
11 def infer_storage_options(urlpath, inherit_storage_options=None):
12     """ Infer storage options from URL path and merge them with existing storage
13 options.
14
15 Parameters
16 ----------
17 urlpath: str or unicode
18 Either local absolute file path or URL (hdfs://namenode:8020/file.csv)
19 inherit_storage_options: dict (optional)
20 Its contents will get merged with the inferred information from the
21 given path
22
23 Returns
24 -------
25 Storage options dict.
26
27 Examples
28 --------
29 >>> infer_storage_options('/mnt/datasets/test.csv') # doctest: +SKIP
30 {"protocol": "file", "path", "/mnt/datasets/test.csv"}
31 >>> infer_storage_options(
32 ... 'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1',
33 ... inherit_storage_options={'extra': 'value'}) # doctest: +SKIP
34 {"protocol": "hdfs", "username": "username", "password": "pwd",
35 "host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
36 "url_query": "q=1", "extra": "value"}
37 """
38 # Handle Windows paths including disk name in this special case
39 if re.match(r"^[a-zA-Z]:[\\/]", urlpath):
40 return {"protocol": "file", "path": urlpath}
41
42 parsed_path = urlsplit(urlpath)
43 protocol = parsed_path.scheme or "file"
44 if parsed_path.fragment:
45 path = "#".join([parsed_path.path, parsed_path.fragment])
46 else:
47 path = parsed_path.path
48 if protocol == "file":
49 # Special case parsing file protocol URL on Windows according to:
50 # https://msdn.microsoft.com/en-us/library/jj710207.aspx
51 windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
52 if windows_path:
53 path = "%s:%s" % windows_path.groups()
54
55 if protocol in ["http", "https"]:
56 # for HTTP, we don't want to parse, as requests will anyway
57 return {"protocol": protocol, "path": urlpath}
58
59 options = {"protocol": protocol, "path": path}
60
61 if parsed_path.netloc:
62 # Parse `hostname` from netloc manually because `parsed_path.hostname`
63 # lowercases the hostname which is not always desirable (e.g. in S3):
64 # https://github.com/dask/dask/issues/1417
65 options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]
66
67 if protocol in ("s3", "gcs", "gs"):
68 options["path"] = options["host"] + options["path"]
69 else:
70 options["host"] = options["host"]
71 if parsed_path.port:
72 options["port"] = parsed_path.port
73 if parsed_path.username:
74 options["username"] = parsed_path.username
75 if parsed_path.password:
76 options["password"] = parsed_path.password
77
78 if parsed_path.query:
79 options["url_query"] = parsed_path.query
80 if parsed_path.fragment:
81 options["url_fragment"] = parsed_path.fragment
82
83 if inherit_storage_options:
84 update_storage_options(options, inherit_storage_options)
85
86 return options
87
88
89 def update_storage_options(options, inherited=None):
90 if not inherited:
91 inherited = {}
92 collisions = set(options) & set(inherited)
93 if collisions:
94 collisions = "\n".join("- %r" % k for k in collisions)
95 raise KeyError(
96 "Collision between inferred and specified storage "
97 "options:\n%s" % collisions
98 )
99 options.update(inherited)
100
101
102 # Compression extensions registered via fsspec.compression.register_compression
103 compressions = {}
104
105
106 def infer_compression(filename):
107 """Infer compression, if available, from filename.
108
109 Infer a named compression type, if registered and available, from filename
110 extension. This includes builtin (gz, bz2, zip) compressions, as well as
111 optional compressions. See fsspec.compression.register_compression.
112 """
113 extension = os.path.splitext(filename)[-1].strip(".")
114 if extension in compressions:
115 return compressions[extension]
116
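# Example (annotation, not part of the original source): behaviour as exercised by
# the compression tests in this changeset (sketch):
#
#     infer_compression("data.gz")       # -> "gzip"
#     infer_compression("data.zip")      # -> "zip"
#     infer_compression("data.unknown")  # -> None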
117
118 def build_name_function(max_int):
119 """ Returns a function that receives a single integer
120 and returns it as a string padded by enough zero characters
121 to align with maximum possible integer
122
123 >>> name_f = build_name_function(57)
124
125 >>> name_f(7)
126 '07'
127 >>> name_f(31)
128 '31'
129 >>> build_name_function(1000)(42)
130 '0042'
131 >>> build_name_function(999)(42)
132 '042'
133 >>> build_name_function(0)(0)
134 '0'
135 """
136     # handle corner cases where max_int is 0 or an exact power of 10
137 max_int += 1e-8
138
139 pad_length = int(math.ceil(math.log10(max_int)))
140
141 def name_function(i):
142 return str(i).zfill(pad_length)
143
144 return name_function
145
146
147 def seek_delimiter(file, delimiter, blocksize):
148 r"""Seek current file to file start, file end, or byte after delimiter seq.
149
150 Seeks file to next chunk delimiter, where chunks are defined on file start,
151 a delimiting sequence, and file end. Use file.tell() to see location afterwards.
152 Note that file start is a valid split, so must be at offset > 0 to seek for
153 delimiter.
154
155 Parameters
156 ----------
157 file: a file
158 delimiter: bytes
159 a delimiter like ``b'\n'`` or message sentinel, matching file .read() type
160 blocksize: int
161 Number of bytes to read from the file at once.
162
163
164 Returns
165 -------
166 Returns True if a delimiter was found, False if at file start or end.
167
168 """
169
170 if file.tell() == 0:
171 # beginning-of-file, return without seek
172 return False
173
174 # Interface is for binary IO, with delimiter as bytes, but initialize last
175 # with result of file.read to preserve compatibility with text IO.
176 last = None
177 while True:
178 current = file.read(blocksize)
179 if not current:
180 # end-of-file without delimiter
181 return False
182 full = last + current if last else current
183 try:
184 if delimiter in full:
185 i = full.index(delimiter)
186 file.seek(file.tell() - (len(full) - i) + len(delimiter))
187 return True
188 elif len(current) < blocksize:
189 # end-of-file without delimiter
190 return False
191 except (OSError, ValueError):
192 pass
193 last = full[-len(delimiter) :]
194
195
196 def read_block(f, offset, length, delimiter=None, split_before=False):
197 """ Read a block of bytes from a file
198
199 Parameters
200 ----------
201 f: File
202 Open file
203 offset: int
204 Byte offset to start read
205 length: int
206 Number of bytes to read, read through end of file if None
207 delimiter: bytes (optional)
208 Ensure reading starts and stops at delimiter bytestring
209 split_before: bool (optional)
210 Start/stop read *before* delimiter bytestring.
211
212
213 If using the ``delimiter=`` keyword argument we ensure that the read
214 starts and stops at delimiter boundaries that follow the locations
215 ``offset`` and ``offset + length``. If ``offset`` is zero then we
216 start at zero, regardless of delimiter. The bytestring returned WILL
217 include the terminating delimiter string.
218
219 Examples
220 --------
221
222 >>> from io import BytesIO # doctest: +SKIP
223 >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300') # doctest: +SKIP
224 >>> read_block(f, 0, 13) # doctest: +SKIP
225 b'Alice, 100\\nBo'
226
227 >>> read_block(f, 0, 13, delimiter=b'\\n') # doctest: +SKIP
228 b'Alice, 100\\nBob, 200\\n'
229
230 >>> read_block(f, 10, 10, delimiter=b'\\n') # doctest: +SKIP
231 b'Bob, 200\\nCharlie, 300'
232 """
233 if delimiter:
234 f.seek(offset)
235 found_start_delim = seek_delimiter(f, delimiter, 2 ** 16)
236 if length is None:
237 return f.read()
238 start = f.tell()
239 length -= start - offset
240
241 f.seek(start + length)
242 found_end_delim = seek_delimiter(f, delimiter, 2 ** 16)
243 end = f.tell()
244
245 # Adjust split location to before delimiter iff seek found the
246 # delimiter sequence, not start or end of file.
247 if found_start_delim and split_before:
248 start -= len(delimiter)
249
250 if found_end_delim and split_before:
251 end -= len(delimiter)
252
253 offset = start
254 length = end - start
255
256 f.seek(offset)
257 b = f.read(length)
258 return b
259
260
261 def tokenize(*args, **kwargs):
262 """ Deterministic token
263
264 (modified from dask.base)
265
266 >>> tokenize([1, 2, '3'])
267 '9d71491b50023b06fc76928e6eddb952'
268
269 >>> tokenize('Hello') == tokenize('Hello')
270 True
271 """
272 if kwargs:
273 args += (kwargs,)
274 return md5(str(args).encode()).hexdigest()
275
276
277 def stringify_path(filepath):
278 """ Attempt to convert a path-like object to a string.
279
280 Parameters
281 ----------
282 filepath: object to be converted
283
284 Returns
285 -------
286 filepath_str: maybe a string version of the object
287
288 Notes
289 -----
290 Objects supporting the fspath protocol (Python 3.6+) are coerced
291 according to their __fspath__ method.
292
293 For backwards compatibility with older Python versions, pathlib.Path
294 objects are specially coerced.
295
296 Any other object is passed through unchanged, which includes bytes,
297 strings, buffers, or anything else that's not even path-like.
298 """
299 if hasattr(filepath, "__fspath__"):
300 return filepath.__fspath__()
301 elif isinstance(filepath, pathlib.Path):
302 return str(filepath)
303 return filepath
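# Illustrative sketch (editor's example): pathlib.Path implements __fspath__
# on Python 3.6+, so it takes the first branch; non path-like objects pass
# through unchanged.
#
#     >>> import pathlib   # doctest: +SKIP
#     >>> stringify_path(pathlib.Path("/tmp/data.csv"))   # doctest: +SKIP
#     '/tmp/data.csv'
#     >>> stringify_path(b"not-a-path")   # doctest: +SKIP
#     b'not-a-path'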
0 [tool.black]
1 # Revert to py34 target syntax to accommodate
2 # errors in trailing commas.
3 # https://github.com/psf/black/pull/763
4 target_version = ['py34']
0 conda:
1 file: docs/environment.yml
(New empty file)
0 [metadata]
1 long_description: file: README.rst
2
3 [versioneer]
4 VCS = git
5 style = pep440
6 versionfile_source = fsspec/_version.py
7 versionfile_build = fsspec/_version.py
8 tag_prefix = ""
9
10 [flake8]
11 exclude = .tox,build,docs/source/conf.py,versioneer.py
12 max-line-length = 88
13 ignore =
14 # Assigning lambda expression
15 E731
16 # Ambiguous variable names
17 E741
18 # line break before binary operator
19 W503
20 # whitespace before :
21 E203
0 #!/usr/bin/env python
1 import os
2
3 from setuptools import setup
4 import versioneer
5
6 here = os.path.abspath(os.path.dirname(__file__))
7 with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
8 long_description = f.read()
9
10 setup(
11 name="fsspec",
12 version=versioneer.get_version(),
13 cmdclass=versioneer.get_cmdclass(),
14 classifiers=[
15 "Development Status :: 4 - Beta",
16 "Intended Audience :: Developers",
17 "License :: OSI Approved :: BSD License",
18 "Operating System :: OS Independent",
19 "Programming Language :: Python :: 3.5",
20 "Programming Language :: Python :: 3.6",
21 "Programming Language :: Python :: 3.7",
22 ],
23 description="File-system specification",
24 long_description=long_description,
25 long_description_content_type="text/markdown",
26 url="http://github.com/intake/filesystem_spec",
27 maintainer="Martin Durant",
28 maintainer_email="mdurant@anaconda.com",
29 license="BSD",
30 keywords="file",
31 packages=["fsspec", "fsspec.implementations"],
32 python_requires=">=3.5",
33 install_requires=open("requirements.txt").read().strip().split("\n"),
34 zip_safe=False,
35 )
0 # content of: tox.ini , put in same dir as setup.py
1 [tox]
2 envlist = {py35,py36,py37}
3
4 [core]
5 conda_channels=
6 defaults
7 conda-forge
8 conda_deps=
9 pip
10 paramiko
11 requests
12 zstandard
13 python-snappy
14 lz4
15 distributed
16 dask
17 pyarrow
18 pyftpdlib
19 cloudpickle
20 pytest
21 pytest-cov
22 fusepy==3.0.1
23 deps=
24 hadoop-test-cluster==0.1.0
25
26 [dev]
27 conda_deps=
28 conda-forge::pre-commit=1.18
29 black=19.3b0
30 flake8
31 deps=
32
33 [testenv]
34 description=Run test suite against target versions.
35 conda_channels=
36 {[core]conda_channels}
37 conda_deps=
38 {[core]conda_deps}
39 deps=
40 {[core]deps}
41 commands =
42 py.test -v -r s
43
44 [testenv:coverage]
45 description=Run test suite with coverage enabled.
46 basepython=python3.7
47 conda_channels=
48 {[core]conda_channels}
49 conda_deps=
50 {[core]conda_deps}
51 deps=
52 {[core]deps}
53 commands =
54 py.test --cov=fsspec -v -r s
55
56 [testenv:dev]
57 description=Setup conda dev env under '.tox/dev'.
58 basepython=python3.7
59 usedevelop=True
60 conda_channels=
61 {[core]conda_channels}
62 conda_deps=
63 {[core]conda_deps}
64 {[dev]conda_deps}
65 deps=
66 {[core]deps}
67 {[dev]deps}
68 commands =
69
70 [testenv:lint]
71 description=Run pre-commit checks.
72 basepython=python3.7
73 skip_install=True
74 conda_deps=
75 {[dev]conda_deps}
76 deps=
77 {[dev]deps}
78 commands_pre=
79 pre-commit install --install-hooks
80 commands=
81 pre-commit run --all-files --show-diff-on-failure
82
83 [testenv:s3fs]
84 description=Run s3fs (@master) test suite against fsspec.
85 conda_channels=
86 defaults
87 conda-forge
88 conda_deps=
89 {[core]conda_deps}
90 boto3
91 botocore
92 httpretty
93 moto
94 six
95 mock
96 deps=
97 {[core]deps}
98 changedir=.tox/s3fs/tmp
99 whitelist_externals=
100 rm
101 git
102 setenv=
103 BOTO_CONFIG=/dev/null
104 AWS_ACCESS_KEY_ID=foobar_key
105 AWS_SECRET_ACCESS_KEY=foobar_secret
106 commands=
107 rm -rf s3fs
108 git clone https://github.com/dask/s3fs
109 py.test -vv s3fs/s3fs
110
111 [testenv:gcsfs]
112 description=Run gcsfs (@master) test suite against fsspec.
113 conda_channels=
114 defaults
115 conda-forge
116 conda_deps=
117 {[core]conda_deps}
118 requests
119 decorator
120 google-auth
121 deps=
122 {[core]deps}
123 vcrpy
124 google-auth-oauthlib
125 changedir=.tox/gcsfs/tmp
126 whitelist_externals=
127 rm
128 git
129 setenv=
130 GCSFS_RECORD_MODE=none
131 commands=
132 rm -rf gcsfs
133 git clone https://github.com/dask/gcsfs
134 py.test -vv gcsfs/gcsfs -k 'not fuse'
0 # Version: 0.18
1
2 """The Versioneer - like a rocketeer, but for versions.
3
4 The Versioneer
5 ==============
6
7 * like a rocketeer, but for versions!
8 * https://github.com/warner/python-versioneer
9 * Brian Warner
10 * License: Public Domain
11 * Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy
12 * [![Latest Version]
13 (https://pypip.in/version/versioneer/badge.svg?style=flat)
14 ](https://pypi.python.org/pypi/versioneer/)
15 * [![Build Status]
16 (https://travis-ci.org/warner/python-versioneer.png?branch=master)
17 ](https://travis-ci.org/warner/python-versioneer)
18
19 This is a tool for managing a recorded version number in distutils-based
20 python projects. The goal is to remove the tedious and error-prone "update
21 the embedded version string" step from your release process. Making a new
22 release should be as easy as recording a new tag in your version-control
23 system, and maybe making new tarballs.
24
25
26 ## Quick Install
27
28 * `pip install versioneer` to somewhere in your $PATH
29 * add a `[versioneer]` section to your setup.cfg (see below)
30 * run `versioneer install` in your source tree, commit the results
31
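For example, the `setup.cfg` of this project (shown elsewhere in this
changeset) configures Versioneer as:

    [versioneer]
    VCS = git
    style = pep440
    versionfile_source = fsspec/_version.py
    versionfile_build = fsspec/_version.py
    tag_prefix = ""
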
32 ## Version Identifiers
33
34 Source trees come from a variety of places:
35
36 * a version-control system checkout (mostly used by developers)
37 * a nightly tarball, produced by build automation
38 * a snapshot tarball, produced by a web-based VCS browser, like github's
39 "tarball from tag" feature
40 * a release tarball, produced by "setup.py sdist", distributed through PyPI
41
42 Within each source tree, the version identifier (either a string or a number,
43 this tool is format-agnostic) can come from a variety of places:
44
45 * ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows
46 about recent "tags" and an absolute revision-id
47 * the name of the directory into which the tarball was unpacked
48 * an expanded VCS keyword ($Id$, etc)
49 * a `_version.py` created by some earlier build step
50
51 For released software, the version identifier is closely related to a VCS
52 tag. Some projects use tag names that include more than just the version
53 string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool
54 needs to strip the tag prefix to extract the version identifier. For
55 unreleased software (between tags), the version identifier should provide
56 enough information to help developers recreate the same tree, while also
57 giving them an idea of roughly how old the tree is (after version 1.2, before
58 version 1.3). Many VCS systems can report a description that captures this,
59 for example `git describe --tags --dirty --always` reports things like
60 "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the
61 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has
62 uncommitted changes).
63
64 The version identifier is used for multiple purposes:
65
66 * to allow the module to self-identify its version: `myproject.__version__`
67 * to choose a name and prefix for a 'setup.py sdist' tarball
68
69 ## Theory of Operation
70
71 Versioneer works by adding a special `_version.py` file into your source
72 tree, where your `__init__.py` can import it. This `_version.py` knows how to
73 dynamically ask the VCS tool for version information at import time.
74
75 `_version.py` also contains `$Revision$` markers, and the installation
76 process marks `_version.py` to have this marker rewritten with a tag name
77 during the `git archive` command. As a result, generated tarballs will
78 contain enough information to get the proper version.
79
80 To allow `setup.py` to compute a version too, a `versioneer.py` is added to
81 the top level of your source tree, next to `setup.py` and the `setup.cfg`
82 that configures it. This overrides several distutils/setuptools commands to
83 compute the version when invoked, and changes `setup.py build` and `setup.py
84 sdist` to replace `_version.py` with a small static file that contains just
85 the generated version data.
86
87 ## Installation
88
89 See [INSTALL.md](./INSTALL.md) for detailed installation instructions.
90
91 ## Version-String Flavors
92
93 Code which uses Versioneer can learn about its version string at runtime by
94 importing `_version` from your main `__init__.py` file and running the
95 `get_versions()` function. From the "outside" (e.g. in `setup.py`), you can
96 import the top-level `versioneer.py` and run `get_versions()`.
97
98 Both functions return a dictionary with different flavors of version
99 information:
100
101 * `['version']`: A condensed version string, rendered using the selected
102 style. This is the most commonly used value for the project's version
103 string. The default "pep440" style yields strings like `0.11`,
104 `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section
105 below for alternative styles.
106
107 * `['full-revisionid']`: detailed revision identifier. For Git, this is the
108 full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac".
109
110 * `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the
111 commit date in ISO 8601 format. This will be None if the date is not
112 available.
113
114 * `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that
115 this is only accurate if run in a VCS checkout, otherwise it is likely to
116 be False or None
117
118 * `['error']`: if the version string could not be computed, this will be set
119 to a string describing the problem, otherwise it will be None. It may be
120 useful to throw an exception in setup.py if this is set, to avoid e.g.
121 creating tarballs with a version string of "unknown".
122
123 Some variants are more useful than others. Including `full-revisionid` in a
124 bug report should allow developers to reconstruct the exact code being tested
125 (or indicate the presence of local changes that should be shared with the
126 developers). `version` is suitable for display in an "about" box or a CLI
127 `--version` output: it can be easily compared against release notes and lists
128 of bugs fixed in various releases.
129
130 The installer adds the following text to your `__init__.py` to place a basic
131 version in `YOURPROJECT.__version__`:
132
133 from ._version import get_versions
134 __version__ = get_versions()['version']
135 del get_versions
136
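As a rough illustration (values here are made up for this example, reusing
the sample strings above), the dictionary returned by `get_versions()` might
look like:

    {'version': '0.11+2.g1076c97.dirty',
     'full-revisionid': '1076c978a8d3cfc70f408fe5974aa6c092c949ac',
     'dirty': True,
     'error': None,
     'date': '2017-05-23T10:05:14-0400'}
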
137 ## Styles
138
139 The setup.cfg `style=` configuration controls how the VCS information is
140 rendered into a version string.
141
142 The default style, "pep440", produces a PEP440-compliant string, equal to the
143 un-prefixed tag name for actual releases, and containing an additional "local
144 version" section with more detail for in-between builds. For Git, this is
145 TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags
146 --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the
147 tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and
148 that this commit is two revisions ("+2") beyond the "0.11" tag. For released
149 software (exactly equal to a known tag), the identifier will only contain the
150 stripped tag, e.g. "0.11".
151
152 Other styles are available. See [details.md](details.md) in the Versioneer
153 source tree for descriptions.
154
155 ## Debugging
156
157 Versioneer tries to avoid fatal errors: if something goes wrong, it will tend
158 to return a version of "0+unknown". To investigate the problem, run `setup.py
159 version`, which will run the version-lookup code in a verbose mode, and will
160 display the full contents of `get_versions()` (including the `error` string,
161 which may help identify what went wrong).
162
163 ## Known Limitations
164
165 Some situations are known to cause problems for Versioneer. This section
166 details the most significant ones. More can be found on the GitHub
167 [issues page](https://github.com/warner/python-versioneer/issues).
168
169 ### Subprojects
170
171 Versioneer has limited support for source trees in which `setup.py` is not in
172 the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are
173 two common reasons why `setup.py` might not be in the root:
174
175 * Source trees which contain multiple subprojects, such as
176 [Buildbot](https://github.com/buildbot/buildbot), which contains both
177 "master" and "slave" subprojects, each with their own `setup.py`,
178 `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI
179 distributions (and upload multiple independently-installable tarballs).
180 * Source trees whose main purpose is to contain a C library, but which also
181 provide bindings to Python (and perhaps other languages) in subdirectories.
182
183 Versioneer will look for `.git` in parent directories, and most operations
184 should get the right version string. However `pip` and `setuptools` have bugs
185 and implementation details which frequently cause `pip install .` from a
186 subproject directory to fail to find a correct version string (so it usually
187 defaults to `0+unknown`).
188
189 `pip install --editable .` should work correctly. `setup.py install` might
190 work too.
191
192 Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in
193 some later version.
194
195 [Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking
196 this issue. The discussion in
197 [PR #61](https://github.com/warner/python-versioneer/pull/61) describes the
198 issue from the Versioneer side in more detail.
199 [pip PR#3176](https://github.com/pypa/pip/pull/3176) and
200 [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve
201 pip to let Versioneer work correctly.
202
203 Versioneer-0.16 and earlier only looked for a `.git` directory next to the
204 `setup.cfg`, so subprojects were completely unsupported with those releases.
205
206 ### Editable installs with setuptools <= 18.5
207
208 `setup.py develop` and `pip install --editable .` allow you to install a
209 project into a virtualenv once, then continue editing the source code (and
210 test) without re-installing after every change.
211
212 "Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a
213 convenient way to specify executable scripts that should be installed along
214 with the python package.
215
216 These both work as expected when using modern setuptools. When using
217 setuptools-18.5 or earlier, however, certain operations will cause
218 `pkg_resources.DistributionNotFound` errors when running the entrypoint
219 script, which must be resolved by re-installing the package. This happens
220 when the install happens with one version, then the egg_info data is
221 regenerated while a different version is checked out. Many setup.py commands
222 cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into
223 a different virtualenv), so this can be surprising.
224
225 [Bug #83](https://github.com/warner/python-versioneer/issues/83) describes
226 this one, but upgrading to a newer version of setuptools should probably
227 resolve it.
228
229 ### Unicode version strings
230
231 While Versioneer works (and is continually tested) with both Python 2 and
232 Python 3, it is not entirely consistent with bytes-vs-unicode distinctions.
233 Newer releases probably generate unicode version strings on py2. It's not
234 clear that this is wrong, but it may be surprising for applications when they
235 write these strings to a network connection or include them in bytes-oriented
236 APIs like cryptographic checksums.
237
238 [Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates
239 this question.
240
241
242 ## Updating Versioneer
243
244 To upgrade your project to a new release of Versioneer, do the following:
245
246 * install the new Versioneer (`pip install -U versioneer` or equivalent)
247 * edit `setup.cfg`, if necessary, to include any new configuration settings
248 indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details.
249 * re-run `versioneer install` in your source tree, to replace
250 `SRC/_version.py`
251 * commit any changed files
252
253 ## Future Directions
254
255 This tool is designed to make it easily extended to other version-control
256 systems: all VCS-specific components are in separate directories like
257 src/git/ . The top-level `versioneer.py` script is assembled from these
258 components by running make-versioneer.py . In the future, make-versioneer.py
259 will take a VCS name as an argument, and will construct a version of
260 `versioneer.py` that is specific to the given VCS. It might also take the
261 configuration arguments that are currently provided manually during
262 installation by editing setup.py . Alternatively, it might go the other
263 direction and include code from all supported VCS systems, reducing the
264 number of intermediate scripts.
265
266
267 ## License
268
269 To make Versioneer easier to embed, all its code is dedicated to the public
270 domain. The `_version.py` that it creates is also in the public domain.
271 Specifically, both are released under the Creative Commons "Public Domain
272 Dedication" license (CC0-1.0), as described in
273 https://creativecommons.org/publicdomain/zero/1.0/ .
274
275 """
276
277 from __future__ import print_function
278
279 try:
280 import configparser
281 except ImportError:
282 import ConfigParser as configparser
283 import errno
284 import json
285 import os
286 import re
287 import subprocess
288 import sys
289
290
291 class VersioneerConfig:
292 """Container for Versioneer configuration parameters."""
293
294
295 def get_root():
296 """Get the project root directory.
297
298 We require that all commands are run from the project root, i.e. the
299 directory that contains setup.py, setup.cfg, and versioneer.py .
300 """
301 root = os.path.realpath(os.path.abspath(os.getcwd()))
302 setup_py = os.path.join(root, "setup.py")
303 versioneer_py = os.path.join(root, "versioneer.py")
304 if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
305 # allow 'python path/to/setup.py COMMAND'
306 root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
307 setup_py = os.path.join(root, "setup.py")
308 versioneer_py = os.path.join(root, "versioneer.py")
309 if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
310 err = (
311 "Versioneer was unable to run the project root directory. "
312 "Versioneer requires setup.py to be executed from "
313 "its immediate directory (like 'python setup.py COMMAND'), "
314 "or in a way that lets it use sys.argv[0] to find the root "
315 "(like 'python path/to/setup.py COMMAND')."
316 )
317 raise VersioneerBadRootError(err)
318 try:
319 # Certain runtime workflows (setup.py install/develop in a setuptools
320 # tree) execute all dependencies in a single python process, so
321 # "versioneer" may be imported multiple times, and python's shared
322 # module-import table will cache the first one. So we can't use
323 # os.path.dirname(__file__), as that will find whichever
324 # versioneer.py was first imported, even in later projects.
325 me = os.path.realpath(os.path.abspath(__file__))
326 me_dir = os.path.normcase(os.path.splitext(me)[0])
327 vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
328 if me_dir != vsr_dir:
329 print(
330 "Warning: build in %s is using versioneer.py from %s"
331 % (os.path.dirname(me), versioneer_py)
332 )
333 except NameError:
334 pass
335 return root
336
337
338 def get_config_from_root(root):
339 """Read the project setup.cfg file to determine Versioneer config."""
340 # This might raise EnvironmentError (if setup.cfg is missing), or
341 # configparser.NoSectionError (if it lacks a [versioneer] section), or
342 # configparser.NoOptionError (if it lacks "VCS="). See the docstring at
343 # the top of versioneer.py for instructions on writing your setup.cfg .
344 setup_cfg = os.path.join(root, "setup.cfg")
345 parser = configparser.SafeConfigParser()
346 with open(setup_cfg, "r") as f:
347 parser.readfp(f)
348 VCS = parser.get("versioneer", "VCS") # mandatory
349
350 def get(parser, name):
351 if parser.has_option("versioneer", name):
352 return parser.get("versioneer", name)
353 return None
354
355 cfg = VersioneerConfig()
356 cfg.VCS = VCS
357 cfg.style = get(parser, "style") or ""
358 cfg.versionfile_source = get(parser, "versionfile_source")
359 cfg.versionfile_build = get(parser, "versionfile_build")
360 cfg.tag_prefix = get(parser, "tag_prefix")
361 if cfg.tag_prefix in ("''", '""'):
362 cfg.tag_prefix = ""
363 cfg.parentdir_prefix = get(parser, "parentdir_prefix")
364 cfg.verbose = get(parser, "verbose")
365 return cfg
366
367
368 class NotThisMethod(Exception):
369 """Exception raised if a method is not valid for the current scenario."""
370
371
372 # these dictionaries contain VCS-specific tools
373 LONG_VERSION_PY = {}
374 HANDLERS = {}
375
376
377 def register_vcs_handler(vcs, method): # decorator
378 """Decorator to mark a method as the handler for a particular VCS."""
379
380 def decorate(f):
381 """Store f in HANDLERS[vcs][method]."""
382 if vcs not in HANDLERS:
383 HANDLERS[vcs] = {}
384 HANDLERS[vcs][method] = f
385 return f
386
387 return decorate
388
389
390 def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
391 """Call the given command(s)."""
392 assert isinstance(commands, list)
393 p = None
394 for c in commands:
395 try:
396 dispcmd = str([c] + args)
397 # remember shell=False, so use git.cmd on windows, not just git
398 p = subprocess.Popen(
399 [c] + args,
400 cwd=cwd,
401 env=env,
402 stdout=subprocess.PIPE,
403 stderr=(subprocess.PIPE if hide_stderr else None),
404 )
405 break
406 except EnvironmentError:
407 e = sys.exc_info()[1]
408 if e.errno == errno.ENOENT:
409 continue
410 if verbose:
411 print("unable to run %s" % dispcmd)
412 print(e)
413 return None, None
414 else:
415 if verbose:
416 print("unable to find command, tried %s" % (commands,))
417 return None, None
418 stdout = p.communicate()[0].strip()
419 if sys.version_info[0] >= 3:
420 stdout = stdout.decode()
421 if p.returncode != 0:
422 if verbose:
423 print("unable to run %s (error)" % dispcmd)
424 print("stdout was %s" % stdout)
425 return None, p.returncode
426 return stdout, p.returncode
427
428
429 LONG_VERSION_PY[
430 "git"
431 ] = '''
432 # This file helps to compute a version number in source trees obtained from
433 # git-archive tarball (such as those provided by githubs download-from-tag
434 # feature). Distribution tarballs (built by setup.py sdist) and build
435 # directories (produced by setup.py build) will contain a much shorter file
436 # that just contains the computed version number.
437
438 # This file is released into the public domain. Generated by
439 # versioneer-0.18 (https://github.com/warner/python-versioneer)
440
441 """Git implementation of _version.py."""
442
443 import errno
444 import os
445 import re
446 import subprocess
447 import sys
448
449
450 def get_keywords():
451 """Get the keywords needed to look up the version information."""
452 # these strings will be replaced by git during git-archive.
453 # setup.py/versioneer.py will grep for the variable names, so they must
454 # each be defined on a line of their own. _version.py will just call
455 # get_keywords().
456 git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s"
457 git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s"
458 git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s"
459 keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
460 return keywords
461
462
463 class VersioneerConfig:
464 """Container for Versioneer configuration parameters."""
465
466
467 def get_config():
468 """Create, populate and return the VersioneerConfig() object."""
469 # these strings are filled in when 'setup.py versioneer' creates
470 # _version.py
471 cfg = VersioneerConfig()
472 cfg.VCS = "git"
473 cfg.style = "%(STYLE)s"
474 cfg.tag_prefix = "%(TAG_PREFIX)s"
475 cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s"
476 cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s"
477 cfg.verbose = False
478 return cfg
479
480
481 class NotThisMethod(Exception):
482 """Exception raised if a method is not valid for the current scenario."""
483
484
485 LONG_VERSION_PY = {}
486 HANDLERS = {}
487
488
489 def register_vcs_handler(vcs, method): # decorator
490 """Decorator to mark a method as the handler for a particular VCS."""
491 def decorate(f):
492 """Store f in HANDLERS[vcs][method]."""
493 if vcs not in HANDLERS:
494 HANDLERS[vcs] = {}
495 HANDLERS[vcs][method] = f
496 return f
497 return decorate
498
499
500 def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
501 env=None):
502 """Call the given command(s)."""
503 assert isinstance(commands, list)
504 p = None
505 for c in commands:
506 try:
507 dispcmd = str([c] + args)
508 # remember shell=False, so use git.cmd on windows, not just git
509 p = subprocess.Popen([c] + args, cwd=cwd, env=env,
510 stdout=subprocess.PIPE,
511 stderr=(subprocess.PIPE if hide_stderr
512 else None))
513 break
514 except EnvironmentError:
515 e = sys.exc_info()[1]
516 if e.errno == errno.ENOENT:
517 continue
518 if verbose:
519 print("unable to run %%s" %% dispcmd)
520 print(e)
521 return None, None
522 else:
523 if verbose:
524 print("unable to find command, tried %%s" %% (commands,))
525 return None, None
526 stdout = p.communicate()[0].strip()
527 if sys.version_info[0] >= 3:
528 stdout = stdout.decode()
529 if p.returncode != 0:
530 if verbose:
531 print("unable to run %%s (error)" %% dispcmd)
532 print("stdout was %%s" %% stdout)
533 return None, p.returncode
534 return stdout, p.returncode
535
536
537 def versions_from_parentdir(parentdir_prefix, root, verbose):
538 """Try to determine the version from the parent directory name.
539
540 Source tarballs conventionally unpack into a directory that includes both
541 the project name and a version string. We will also support searching up
542 two directory levels for an appropriately named parent directory
543 """
544 rootdirs = []
545
546 for i in range(3):
547 dirname = os.path.basename(root)
548 if dirname.startswith(parentdir_prefix):
549 return {"version": dirname[len(parentdir_prefix):],
550 "full-revisionid": None,
551 "dirty": False, "error": None, "date": None}
552 else:
553 rootdirs.append(root)
554 root = os.path.dirname(root) # up a level
555
556 if verbose:
557 print("Tried directories %%s but none started with prefix %%s" %%
558 (str(rootdirs), parentdir_prefix))
559 raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
560
561
562 @register_vcs_handler("git", "get_keywords")
563 def git_get_keywords(versionfile_abs):
564 """Extract version information from the given file."""
565 # the code embedded in _version.py can just fetch the value of these
566 # keywords. When used from setup.py, we don't want to import _version.py,
567 # so we do it with a regexp instead. This function is not used from
568 # _version.py.
569 keywords = {}
570 try:
571 f = open(versionfile_abs, "r")
572 for line in f.readlines():
573 if line.strip().startswith("git_refnames ="):
574 mo = re.search(r'=\s*"(.*)"', line)
575 if mo:
576 keywords["refnames"] = mo.group(1)
577 if line.strip().startswith("git_full ="):
578 mo = re.search(r'=\s*"(.*)"', line)
579 if mo:
580 keywords["full"] = mo.group(1)
581 if line.strip().startswith("git_date ="):
582 mo = re.search(r'=\s*"(.*)"', line)
583 if mo:
584 keywords["date"] = mo.group(1)
585 f.close()
586 except EnvironmentError:
587 pass
588 return keywords
589
590
591 @register_vcs_handler("git", "keywords")
592 def git_versions_from_keywords(keywords, tag_prefix, verbose):
593 """Get version information from git keywords."""
594 if not keywords:
595 raise NotThisMethod("no keywords at all, weird")
596 date = keywords.get("date")
597 if date is not None:
598 # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant
599 # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601
600 # -like" string, which we must then edit to make compliant), because
601 # it's been around since git-1.5.3, and it's too difficult to
602 # discover which version we're using, or to work around using an
603 # older one.
604 date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
605 refnames = keywords["refnames"].strip()
606 if refnames.startswith("$Format"):
607 if verbose:
608 print("keywords are unexpanded, not using")
609 raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
610 refs = set([r.strip() for r in refnames.strip("()").split(",")])
611 # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
612 # just "foo-1.0". If we see a "tag: " prefix, prefer those.
613 TAG = "tag: "
614 tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
615 if not tags:
616 # Either we're using git < 1.8.3, or there really are no tags. We use
617 # a heuristic: assume all version tags have a digit. The old git %%d
618 # expansion behaves like git log --decorate=short and strips out the
619 # refs/heads/ and refs/tags/ prefixes that would let us distinguish
620 # between branches and tags. By ignoring refnames without digits, we
621 # filter out many common branch names like "release" and
622 # "stabilization", as well as "HEAD" and "master".
623 tags = set([r for r in refs if re.search(r'\d', r)])
624 if verbose:
625 print("discarding '%%s', no digits" %% ",".join(refs - tags))
626 if verbose:
627 print("likely tags: %%s" %% ",".join(sorted(tags)))
628 for ref in sorted(tags):
629 # sorting will prefer e.g. "2.0" over "2.0rc1"
630 if ref.startswith(tag_prefix):
631 r = ref[len(tag_prefix):]
632 if verbose:
633 print("picking %%s" %% r)
634 return {"version": r,
635 "full-revisionid": keywords["full"].strip(),
636 "dirty": False, "error": None,
637 "date": date}
638 # no suitable tags, so version is "0+unknown", but full hex is still there
639 if verbose:
640 print("no suitable tags, using unknown + full revision id")
641 return {"version": "0+unknown",
642 "full-revisionid": keywords["full"].strip(),
643 "dirty": False, "error": "no suitable tags", "date": None}
644
645
646 @register_vcs_handler("git", "pieces_from_vcs")
647 def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
648 """Get version from 'git describe' in the root of the source tree.
649
650 This only gets called if the git-archive 'subst' keywords were *not*
651 expanded, and _version.py hasn't already been rewritten with a short
652 version string, meaning we're inside a checked out source tree.
653 """
654 GITS = ["git"]
655 if sys.platform == "win32":
656 GITS = ["git.cmd", "git.exe"]
657
658 out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
659 hide_stderr=True)
660 if rc != 0:
661 if verbose:
662 print("Directory %%s not under git control" %% root)
663 raise NotThisMethod("'git rev-parse --git-dir' returned error")
664
665 # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
666 # if there isn't one, this yields HEX[-dirty] (no NUM)
667 describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
668 "--always", "--long",
669 "--match", "%%s*" %% tag_prefix],
670 cwd=root)
671 # --long was added in git-1.5.5
672 if describe_out is None:
673 raise NotThisMethod("'git describe' failed")
674 describe_out = describe_out.strip()
675 full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
676 if full_out is None:
677 raise NotThisMethod("'git rev-parse' failed")
678 full_out = full_out.strip()
679
680 pieces = {}
681 pieces["long"] = full_out
682 pieces["short"] = full_out[:7] # maybe improved later
683 pieces["error"] = None
684
685 # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
686 # TAG might have hyphens.
687 git_describe = describe_out
688
689 # look for -dirty suffix
690 dirty = git_describe.endswith("-dirty")
691 pieces["dirty"] = dirty
692 if dirty:
693 git_describe = git_describe[:git_describe.rindex("-dirty")]
694
695 # now we have TAG-NUM-gHEX or HEX
696
697 if "-" in git_describe:
698 # TAG-NUM-gHEX
699 mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
700 if not mo:
701 # unparseable. Maybe git-describe is misbehaving?
702 pieces["error"] = ("unable to parse git-describe output: '%%s'"
703 %% describe_out)
704 return pieces
705
706 # tag
707 full_tag = mo.group(1)
708 if not full_tag.startswith(tag_prefix):
709 if verbose:
710 fmt = "tag '%%s' doesn't start with prefix '%%s'"
711 print(fmt %% (full_tag, tag_prefix))
712 pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'"
713 %% (full_tag, tag_prefix))
714 return pieces
715 pieces["closest-tag"] = full_tag[len(tag_prefix):]
716
717 # distance: number of commits since tag
718 pieces["distance"] = int(mo.group(2))
719
720 # commit: short hex revision ID
721 pieces["short"] = mo.group(3)
722
723 else:
724 # HEX: no tags
725 pieces["closest-tag"] = None
726 count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
727 cwd=root)
728 pieces["distance"] = int(count_out) # total number of commits
729
730 # commit date: see ISO-8601 comment in git_versions_from_keywords()
731 date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"],
732 cwd=root)[0].strip()
733 pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
734
735 return pieces
736
737
738 def plus_or_dot(pieces):
739 """Return a + if we don't already have one, else return a ."""
740 if "+" in pieces.get("closest-tag", ""):
741 return "."
742 return "+"
743
744
745 def render_pep440(pieces):
746 """Build up version string, with post-release "local version identifier".
747
748 Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
749 get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
750
751 Exceptions:
752 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
753 """
754 if pieces["closest-tag"]:
755 rendered = pieces["closest-tag"]
756 if pieces["distance"] or pieces["dirty"]:
757 rendered += plus_or_dot(pieces)
758 rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"])
759 if pieces["dirty"]:
760 rendered += ".dirty"
761 else:
762 # exception #1
763 rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"],
764 pieces["short"])
765 if pieces["dirty"]:
766 rendered += ".dirty"
767 return rendered
768
769
770 def render_pep440_pre(pieces):
771 """TAG[.post.devDISTANCE] -- No -dirty.
772
773 Exceptions:
774 1: no tags. 0.post.devDISTANCE
775 """
776 if pieces["closest-tag"]:
777 rendered = pieces["closest-tag"]
778 if pieces["distance"]:
779 rendered += ".post.dev%%d" %% pieces["distance"]
780 else:
781 # exception #1
782 rendered = "0.post.dev%%d" %% pieces["distance"]
783 return rendered
784
785
786 def render_pep440_post(pieces):
787 """TAG[.postDISTANCE[.dev0]+gHEX] .
788
789 The ".dev0" means dirty. Note that .dev0 sorts backwards
790 (a dirty tree will appear "older" than the corresponding clean one),
791 but you shouldn't be releasing software with -dirty anyways.
792
793 Exceptions:
794 1: no tags. 0.postDISTANCE[.dev0]
795 """
796 if pieces["closest-tag"]:
797 rendered = pieces["closest-tag"]
798 if pieces["distance"] or pieces["dirty"]:
799 rendered += ".post%%d" %% pieces["distance"]
800 if pieces["dirty"]:
801 rendered += ".dev0"
802 rendered += plus_or_dot(pieces)
803 rendered += "g%%s" %% pieces["short"]
804 else:
805 # exception #1
806 rendered = "0.post%%d" %% pieces["distance"]
807 if pieces["dirty"]:
808 rendered += ".dev0"
809 rendered += "+g%%s" %% pieces["short"]
810 return rendered
811
812
813 def render_pep440_old(pieces):
814 """TAG[.postDISTANCE[.dev0]] .
815
816 The ".dev0" means dirty.
817
818 Exceptions:
819 1: no tags. 0.postDISTANCE[.dev0]
820 """
821 if pieces["closest-tag"]:
822 rendered = pieces["closest-tag"]
823 if pieces["distance"] or pieces["dirty"]:
824 rendered += ".post%%d" %% pieces["distance"]
825 if pieces["dirty"]:
826 rendered += ".dev0"
827 else:
828 # exception #1
829 rendered = "0.post%%d" %% pieces["distance"]
830 if pieces["dirty"]:
831 rendered += ".dev0"
832 return rendered
833
834
835 def render_git_describe(pieces):
836 """TAG[-DISTANCE-gHEX][-dirty].
837
838 Like 'git describe --tags --dirty --always'.
839
840 Exceptions:
841 1: no tags. HEX[-dirty] (note: no 'g' prefix)
842 """
843 if pieces["closest-tag"]:
844 rendered = pieces["closest-tag"]
845 if pieces["distance"]:
846 rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"])
847 else:
848 # exception #1
849 rendered = pieces["short"]
850 if pieces["dirty"]:
851 rendered += "-dirty"
852 return rendered
853
854
855 def render_git_describe_long(pieces):
856 """TAG-DISTANCE-gHEX[-dirty].
857
858 Like 'git describe --tags --dirty --always --long'.
859 The distance/hash is unconditional.
860
861 Exceptions:
862 1: no tags. HEX[-dirty] (note: no 'g' prefix)
863 """
864 if pieces["closest-tag"]:
865 rendered = pieces["closest-tag"]
866 rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"])
867 else:
868 # exception #1
869 rendered = pieces["short"]
870 if pieces["dirty"]:
871 rendered += "-dirty"
872 return rendered
873
874
875 def render(pieces, style):
876 """Render the given version pieces into the requested style."""
877 if pieces["error"]:
878 return {"version": "unknown",
879 "full-revisionid": pieces.get("long"),
880 "dirty": None,
881 "error": pieces["error"],
882 "date": None}
883
884 if not style or style == "default":
885 style = "pep440" # the default
886
887 if style == "pep440":
888 rendered = render_pep440(pieces)
889 elif style == "pep440-pre":
890 rendered = render_pep440_pre(pieces)
891 elif style == "pep440-post":
892 rendered = render_pep440_post(pieces)
893 elif style == "pep440-old":
894 rendered = render_pep440_old(pieces)
895 elif style == "git-describe":
896 rendered = render_git_describe(pieces)
897 elif style == "git-describe-long":
898 rendered = render_git_describe_long(pieces)
899 else:
900 raise ValueError("unknown style '%%s'" %% style)
901
902 return {"version": rendered, "full-revisionid": pieces["long"],
903 "dirty": pieces["dirty"], "error": None,
904 "date": pieces.get("date")}
905
906
907 def get_versions():
908 """Get version information or return default if unable to do so."""
909 # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
910 # __file__, we can work backwards from there to the root. Some
911 # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
912 # case we can only use expanded keywords.
913
914 cfg = get_config()
915 verbose = cfg.verbose
916
917 try:
918 return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
919 verbose)
920 except NotThisMethod:
921 pass
922
923 try:
924 root = os.path.realpath(__file__)
925 # versionfile_source is the relative path from the top of the source
926 # tree (where the .git directory might live) to this file. Invert
927 # this to find the root from __file__.
928 for i in cfg.versionfile_source.split('/'):
929 root = os.path.dirname(root)
930 except NameError:
931 return {"version": "0+unknown", "full-revisionid": None,
932 "dirty": None,
933 "error": "unable to find root of source tree",
934 "date": None}
935
936 try:
937 pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
938 return render(pieces, cfg.style)
939 except NotThisMethod:
940 pass
941
942 try:
943 if cfg.parentdir_prefix:
944 return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
945 except NotThisMethod:
946 pass
947
948 return {"version": "0+unknown", "full-revisionid": None,
949 "dirty": None,
950 "error": "unable to compute version", "date": None}
951 '''
952
953
954 @register_vcs_handler("git", "get_keywords")
955 def git_get_keywords(versionfile_abs):
956 """Extract version information from the given file."""
957 # the code embedded in _version.py can just fetch the value of these
958 # keywords. When used from setup.py, we don't want to import _version.py,
959 # so we do it with a regexp instead. This function is not used from
960 # _version.py.
961 keywords = {}
962 try:
963 f = open(versionfile_abs, "r")
964 for line in f.readlines():
965 if line.strip().startswith("git_refnames ="):
966 mo = re.search(r'=\s*"(.*)"', line)
967 if mo:
968 keywords["refnames"] = mo.group(1)
969 if line.strip().startswith("git_full ="):
970 mo = re.search(r'=\s*"(.*)"', line)
971 if mo:
972 keywords["full"] = mo.group(1)
973 if line.strip().startswith("git_date ="):
974 mo = re.search(r'=\s*"(.*)"', line)
975 if mo:
976 keywords["date"] = mo.group(1)
977 f.close()
978 except EnvironmentError:
979 pass
980 return keywords
981
982
983 @register_vcs_handler("git", "keywords")
984 def git_versions_from_keywords(keywords, tag_prefix, verbose):
985 """Get version information from git keywords."""
986 if not keywords:
987 raise NotThisMethod("no keywords at all, weird")
988 date = keywords.get("date")
989 if date is not None:
990 # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
991 # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
992 # -like" string, which we must then edit to make compliant), because
993 # it's been around since git-1.5.3, and it's too difficult to
994 # discover which version we're using, or to work around using an
995 # older one.
996 date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
997 refnames = keywords["refnames"].strip()
998 if refnames.startswith("$Format"):
999 if verbose:
1000 print("keywords are unexpanded, not using")
1001 raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
1002 refs = set([r.strip() for r in refnames.strip("()").split(",")])
1003 # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
1004 # just "foo-1.0". If we see a "tag: " prefix, prefer those.
1005 TAG = "tag: "
1006 tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
1007 if not tags:
1008 # Either we're using git < 1.8.3, or there really are no tags. We use
1009 # a heuristic: assume all version tags have a digit. The old git %d
1010 # expansion behaves like git log --decorate=short and strips out the
1011 # refs/heads/ and refs/tags/ prefixes that would let us distinguish
1012 # between branches and tags. By ignoring refnames without digits, we
1013 # filter out many common branch names like "release" and
1014 # "stabilization", as well as "HEAD" and "master".
1015 tags = set([r for r in refs if re.search(r"\d", r)])
1016 if verbose:
1017 print("discarding '%s', no digits" % ",".join(refs - tags))
1018 if verbose:
1019 print("likely tags: %s" % ",".join(sorted(tags)))
1020 for ref in sorted(tags):
1021 # sorting will prefer e.g. "2.0" over "2.0rc1"
1022 if ref.startswith(tag_prefix):
1023 r = ref[len(tag_prefix) :]
1024 if verbose:
1025 print("picking %s" % r)
1026 return {
1027 "version": r,
1028 "full-revisionid": keywords["full"].strip(),
1029 "dirty": False,
1030 "error": None,
1031 "date": date,
1032 }
1033 # no suitable tags, so version is "0+unknown", but full hex is still there
1034 if verbose:
1035 print("no suitable tags, using unknown + full revision id")
1036 return {
1037 "version": "0+unknown",
1038 "full-revisionid": keywords["full"].strip(),
1039 "dirty": False,
1040 "error": "no suitable tags",
1041 "date": None,
1042 }
1043
1044
1045 @register_vcs_handler("git", "pieces_from_vcs")
1046 def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
1047 """Get version from 'git describe' in the root of the source tree.
1048
1049 This only gets called if the git-archive 'subst' keywords were *not*
1050 expanded, and _version.py hasn't already been rewritten with a short
1051 version string, meaning we're inside a checked out source tree.
1052 """
1053 GITS = ["git"]
1054 if sys.platform == "win32":
1055 GITS = ["git.cmd", "git.exe"]
1056
1057 out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
1058 if rc != 0:
1059 if verbose:
1060 print("Directory %s not under git control" % root)
1061 raise NotThisMethod("'git rev-parse --git-dir' returned error")
1062
1063 # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
1064 # if there isn't one, this yields HEX[-dirty] (no NUM)
1065 describe_out, rc = run_command(
1066 GITS,
1067 [
1068 "describe",
1069 "--tags",
1070 "--dirty",
1071 "--always",
1072 "--long",
1073 "--match",
1074 "%s*" % tag_prefix,
1075 ],
1076 cwd=root,
1077 )
1078 # --long was added in git-1.5.5
1079 if describe_out is None:
1080 raise NotThisMethod("'git describe' failed")
1081 describe_out = describe_out.strip()
1082 full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
1083 if full_out is None:
1084 raise NotThisMethod("'git rev-parse' failed")
1085 full_out = full_out.strip()
1086
1087 pieces = {}
1088 pieces["long"] = full_out
1089 pieces["short"] = full_out[:7] # maybe improved later
1090 pieces["error"] = None
1091
1092 # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
1093 # TAG might have hyphens.
1094 git_describe = describe_out
1095
1096 # look for -dirty suffix
1097 dirty = git_describe.endswith("-dirty")
1098 pieces["dirty"] = dirty
1099 if dirty:
1100 git_describe = git_describe[: git_describe.rindex("-dirty")]
1101
1102 # now we have TAG-NUM-gHEX or HEX
1103
1104 if "-" in git_describe:
1105 # TAG-NUM-gHEX
1106 mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
1107 if not mo:
1108 # unparseable. Maybe git-describe is misbehaving?
1109 pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
1110 return pieces
1111
1112 # tag
1113 full_tag = mo.group(1)
1114 if not full_tag.startswith(tag_prefix):
1115 if verbose:
1116 fmt = "tag '%s' doesn't start with prefix '%s'"
1117 print(fmt % (full_tag, tag_prefix))
1118 pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
1119 full_tag,
1120 tag_prefix,
1121 )
1122 return pieces
1123 pieces["closest-tag"] = full_tag[len(tag_prefix) :]
1124
1125 # distance: number of commits since tag
1126 pieces["distance"] = int(mo.group(2))
1127
1128 # commit: short hex revision ID
1129 pieces["short"] = mo.group(3)
1130
1131 else:
1132 # HEX: no tags
1133 pieces["closest-tag"] = None
1134 count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
1135 pieces["distance"] = int(count_out) # total number of commits
1136
1137 # commit date: see ISO-8601 comment in git_versions_from_keywords()
1138 date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[
1139 0
1140 ].strip()
1141 pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
1142
1143 return pieces
1144
1145
1146 def do_vcs_install(manifest_in, versionfile_source, ipy):
1147 """Git-specific installation logic for Versioneer.
1148
1149 For Git, this means creating/changing .gitattributes to mark _version.py
1150 for export-subst keyword substitution.
1151 """
1152 GITS = ["git"]
1153 if sys.platform == "win32":
1154 GITS = ["git.cmd", "git.exe"]
1155 files = [manifest_in, versionfile_source]
1156 if ipy:
1157 files.append(ipy)
1158 try:
1159 me = __file__
1160 if me.endswith(".pyc") or me.endswith(".pyo"):
1161 me = os.path.splitext(me)[0] + ".py"
1162 versioneer_file = os.path.relpath(me)
1163 except NameError:
1164 versioneer_file = "versioneer.py"
1165 files.append(versioneer_file)
1166 present = False
1167 try:
1168 f = open(".gitattributes", "r")
1169 for line in f.readlines():
1170 if line.strip().startswith(versionfile_source):
1171 if "export-subst" in line.strip().split()[1:]:
1172 present = True
1173 f.close()
1174 except EnvironmentError:
1175 pass
1176 if not present:
1177 f = open(".gitattributes", "a+")
1178 f.write("%s export-subst\n" % versionfile_source)
1179 f.close()
1180 files.append(".gitattributes")
1181 run_command(GITS, ["add", "--"] + files)
1182
1183
1184 def versions_from_parentdir(parentdir_prefix, root, verbose):
1185 """Try to determine the version from the parent directory name.
1186
1187 Source tarballs conventionally unpack into a directory that includes both
1188 the project name and a version string. We will also support searching up
1189 two directory levels for an appropriately named parent directory
1190 """
1191 rootdirs = []
1192
1193 for i in range(3):
1194 dirname = os.path.basename(root)
1195 if dirname.startswith(parentdir_prefix):
1196 return {
1197 "version": dirname[len(parentdir_prefix) :],
1198 "full-revisionid": None,
1199 "dirty": False,
1200 "error": None,
1201 "date": None,
1202 }
1203 else:
1204 rootdirs.append(root)
1205 root = os.path.dirname(root) # up a level
1206
1207 if verbose:
1208 print(
1209 "Tried directories %s but none started with prefix %s"
1210 % (str(rootdirs), parentdir_prefix)
1211 )
1212 raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
1213
1214
1215 SHORT_VERSION_PY = """
1216 # This file was generated by 'versioneer.py' (0.18) from
1217 # revision-control system data, or from the parent directory name of an
1218 # unpacked source archive. Distribution tarballs contain a pre-generated copy
1219 # of this file.
1220
1221 import json
1222
1223 version_json = '''
1224 %s
1225 ''' # END VERSION_JSON
1226
1227
1228 def get_versions():
1229 return json.loads(version_json)
1230 """
1231
1232
1233 def versions_from_file(filename):
1234 """Try to determine the version from _version.py if present."""
1235 try:
1236 with open(filename) as f:
1237 contents = f.read()
1238 except EnvironmentError:
1239 raise NotThisMethod("unable to read _version.py")
1240 mo = re.search(
1241 r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S
1242 )
1243 if not mo:
1244 mo = re.search(
1245 r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S
1246 )
1247 if not mo:
1248 raise NotThisMethod("no version_json in _version.py")
1249 return json.loads(mo.group(1))
1250
1251
1252 def write_to_version_file(filename, versions):
1253 """Write the given version number to the given _version.py file."""
1254 os.unlink(filename)
1255 contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": "))
1256 with open(filename, "w") as f:
1257 f.write(SHORT_VERSION_PY % contents)
1258
1259 print("set %s to '%s'" % (filename, versions["version"]))
1260
1261
1262 def plus_or_dot(pieces):
1263 """Return a + if we don't already have one, else return a ."""
1264 if "+" in pieces.get("closest-tag", ""):
1265 return "."
1266 return "+"
1267
1268
1269 def render_pep440(pieces):
1270 """Build up version string, with post-release "local version identifier".
1271
1272 Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
1273 get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
1274
1275 Exceptions:
1276 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
1277 """
1278 if pieces["closest-tag"]:
1279 rendered = pieces["closest-tag"]
1280 if pieces["distance"] or pieces["dirty"]:
1281 rendered += plus_or_dot(pieces)
1282 rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
1283 if pieces["dirty"]:
1284 rendered += ".dirty"
1285 else:
1286 # exception #1
1287 rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
1288 if pieces["dirty"]:
1289 rendered += ".dirty"
1290 return rendered
1291
1292
1293 def render_pep440_pre(pieces):
1294 """TAG[.post.devDISTANCE] -- No -dirty.
1295
1296 Exceptions:
1297 1: no tags. 0.post.devDISTANCE
1298 """
1299 if pieces["closest-tag"]:
1300 rendered = pieces["closest-tag"]
1301 if pieces["distance"]:
1302 rendered += ".post.dev%d" % pieces["distance"]
1303 else:
1304 # exception #1
1305 rendered = "0.post.dev%d" % pieces["distance"]
1306 return rendered
1307
1308
1309 def render_pep440_post(pieces):
1310 """TAG[.postDISTANCE[.dev0]+gHEX] .
1311
1312 The ".dev0" means dirty. Note that .dev0 sorts backwards
1313 (a dirty tree will appear "older" than the corresponding clean one),
1314 but you shouldn't be releasing software with -dirty anyways.
1315
1316 Exceptions:
1317 1: no tags. 0.postDISTANCE[.dev0]
1318 """
1319 if pieces["closest-tag"]:
1320 rendered = pieces["closest-tag"]
1321 if pieces["distance"] or pieces["dirty"]:
1322 rendered += ".post%d" % pieces["distance"]
1323 if pieces["dirty"]:
1324 rendered += ".dev0"
1325 rendered += plus_or_dot(pieces)
1326 rendered += "g%s" % pieces["short"]
1327 else:
1328 # exception #1
1329 rendered = "0.post%d" % pieces["distance"]
1330 if pieces["dirty"]:
1331 rendered += ".dev0"
1332 rendered += "+g%s" % pieces["short"]
1333 return rendered
1334
1335
1336 def render_pep440_old(pieces):
1337 """TAG[.postDISTANCE[.dev0]] .
1338
1339 The ".dev0" means dirty.
1340
1341 Exceptions:
1342 1: no tags. 0.postDISTANCE[.dev0]
1343 """
1344 if pieces["closest-tag"]:
1345 rendered = pieces["closest-tag"]
1346 if pieces["distance"] or pieces["dirty"]:
1347 rendered += ".post%d" % pieces["distance"]
1348 if pieces["dirty"]:
1349 rendered += ".dev0"
1350 else:
1351 # exception #1
1352 rendered = "0.post%d" % pieces["distance"]
1353 if pieces["dirty"]:
1354 rendered += ".dev0"
1355 return rendered
1356
1357
1358 def render_git_describe(pieces):
1359 """TAG[-DISTANCE-gHEX][-dirty].
1360
1361 Like 'git describe --tags --dirty --always'.
1362
1363 Exceptions:
1364 1: no tags. HEX[-dirty] (note: no 'g' prefix)
1365 """
1366 if pieces["closest-tag"]:
1367 rendered = pieces["closest-tag"]
1368 if pieces["distance"]:
1369 rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
1370 else:
1371 # exception #1
1372 rendered = pieces["short"]
1373 if pieces["dirty"]:
1374 rendered += "-dirty"
1375 return rendered
1376
1377
1378 def render_git_describe_long(pieces):
1379 """TAG-DISTANCE-gHEX[-dirty].
1380
1381 Like 'git describe --tags --dirty --always --long'.
1382 The distance/hash is unconditional.
1383
1384 Exceptions:
1385 1: no tags. HEX[-dirty] (note: no 'g' prefix)
1386 """
1387 if pieces["closest-tag"]:
1388 rendered = pieces["closest-tag"]
1389 rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
1390 else:
1391 # exception #1
1392 rendered = pieces["short"]
1393 if pieces["dirty"]:
1394 rendered += "-dirty"
1395 return rendered
1396
1397
1398 def render(pieces, style):
1399 """Render the given version pieces into the requested style."""
1400 if pieces["error"]:
1401 return {
1402 "version": "unknown",
1403 "full-revisionid": pieces.get("long"),
1404 "dirty": None,
1405 "error": pieces["error"],
1406 "date": None,
1407 }
1408
1409 if not style or style == "default":
1410 style = "pep440" # the default
1411
1412 if style == "pep440":
1413 rendered = render_pep440(pieces)
1414 elif style == "pep440-pre":
1415 rendered = render_pep440_pre(pieces)
1416 elif style == "pep440-post":
1417 rendered = render_pep440_post(pieces)
1418 elif style == "pep440-old":
1419 rendered = render_pep440_old(pieces)
1420 elif style == "git-describe":
1421 rendered = render_git_describe(pieces)
1422 elif style == "git-describe-long":
1423 rendered = render_git_describe_long(pieces)
1424 else:
1425 raise ValueError("unknown style '%s'" % style)
1426
1427 return {
1428 "version": rendered,
1429 "full-revisionid": pieces["long"],
1430 "dirty": pieces["dirty"],
1431 "error": None,
1432 "date": pieces.get("date"),
1433 }
1434
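# Illustrative only: a hypothetical call like
#   render({"closest-tag": "1.2.0", "distance": 0, "short": "abc1234",
#           "dirty": False, "long": "abc1234deadbeef", "error": None,
#           "date": "2019-10-01T00:00:00"}, "pep440")
# would return {"version": "1.2.0", "full-revisionid": "abc1234deadbeef",
#               "dirty": False, "error": None, "date": "2019-10-01T00:00:00"}.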
1435
1436 class VersioneerBadRootError(Exception):
1437 """The project root directory is unknown or missing key files."""
1438
1439
1440 def get_versions(verbose=False):
1441 """Get the project version from whatever source is available.
1442
1443 Returns a dict with 'version', 'full-revisionid', 'dirty', 'error' and 'date' keys.
1444 """
1445 if "versioneer" in sys.modules:
1446 # see the discussion in cmdclass.py:get_cmdclass()
1447 del sys.modules["versioneer"]
1448
1449 root = get_root()
1450 cfg = get_config_from_root(root)
1451
1452 assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg"
1453 handlers = HANDLERS.get(cfg.VCS)
1454 assert handlers, "unrecognized VCS '%s'" % cfg.VCS
1455 verbose = verbose or cfg.verbose
1456 assert (
1457 cfg.versionfile_source is not None
1458 ), "please set versioneer.versionfile_source"
1459 assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"
1460
1461 versionfile_abs = os.path.join(root, cfg.versionfile_source)
1462
1463 # extract version from the first of these that works: expanded VCS keywords
1464 # (git archive tarballs), _version.py, a VCS command (e.g. 'git describe'),
1465 # then the parent directory name. This is meant to work for developers using
1466 # a source checkout, for users of a tarball created by 'setup.py sdist', and
1467 # for users of a tarball/zipball created by 'git archive' or github's download-from-tag feature or the equivalent in other VCSes.
1468
1469 get_keywords_f = handlers.get("get_keywords")
1470 from_keywords_f = handlers.get("keywords")
1471 if get_keywords_f and from_keywords_f:
1472 try:
1473 keywords = get_keywords_f(versionfile_abs)
1474 ver = from_keywords_f(keywords, cfg.tag_prefix, verbose)
1475 if verbose:
1476 print("got version from expanded keyword %s" % ver)
1477 return ver
1478 except NotThisMethod:
1479 pass
1480
1481 try:
1482 ver = versions_from_file(versionfile_abs)
1483 if verbose:
1484 print("got version from file %s %s" % (versionfile_abs, ver))
1485 return ver
1486 except NotThisMethod:
1487 pass
1488
1489 from_vcs_f = handlers.get("pieces_from_vcs")
1490 if from_vcs_f:
1491 try:
1492 pieces = from_vcs_f(cfg.tag_prefix, root, verbose)
1493 ver = render(pieces, cfg.style)
1494 if verbose:
1495 print("got version from VCS %s" % ver)
1496 return ver
1497 except NotThisMethod:
1498 pass
1499
1500 try:
1501 if cfg.parentdir_prefix:
1502 ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
1503 if verbose:
1504 print("got version from parentdir %s" % ver)
1505 return ver
1506 except NotThisMethod:
1507 pass
1508
1509 if verbose:
1510 print("unable to compute version")
1511
1512 return {
1513 "version": "0+unknown",
1514 "full-revisionid": None,
1515 "dirty": None,
1516 "error": "unable to compute version",
1517 "date": None,
1518 }
1519
1520
1521 def get_version():
1522 """Get the short version string for this project."""
1523 return get_versions()["version"]
1524
1525
1526 def get_cmdclass():
1527 """Get the custom setuptools/distutils subclasses used by Versioneer."""
1528 if "versioneer" in sys.modules:
1529 del sys.modules["versioneer"]
1530 # this fixes the "python setup.py develop" case (also 'install' and
1531 # 'easy_install .'), in which subdependencies of the main project are
1532 # built (using setup.py bdist_egg) in the same python process. Assume
1533 # a main project A and a dependency B, which use different versions
1534 # of Versioneer. A's setup.py imports A's Versioneer, leaving it in
1535 # sys.modules by the time B's setup.py is executed, causing B to run
1536 # with the wrong versioneer. Setuptools wraps the sub-dep builds in a
1537 # sandbox that restores sys.modules to its pre-build state, so the
1538 # parent is protected against the child's "import versioneer". By
1539 # removing ourselves from sys.modules here, before the child build
1540 # happens, we protect the child from the parent's versioneer too.
1541 # Also see https://github.com/warner/python-versioneer/issues/52
1542
1543 cmds = {}
1544
1545 # we add "version" to both distutils and setuptools
1546 from distutils.core import Command
1547
1548 class cmd_version(Command):
1549 description = "report generated version string"
1550 user_options = []
1551 boolean_options = []
1552
1553 def initialize_options(self):
1554 pass
1555
1556 def finalize_options(self):
1557 pass
1558
1559 def run(self):
1560 vers = get_versions(verbose=True)
1561 print("Version: %s" % vers["version"])
1562 print(" full-revisionid: %s" % vers.get("full-revisionid"))
1563 print(" dirty: %s" % vers.get("dirty"))
1564 print(" date: %s" % vers.get("date"))
1565 if vers["error"]:
1566 print(" error: %s" % vers["error"])
1567
1568 cmds["version"] = cmd_version
1569
1570 # we override "build_py" in both distutils and setuptools
1571 #
1572 # most invocation pathways end up running build_py:
1573 # distutils/build -> build_py
1574 # distutils/install -> distutils/build ->..
1575 # setuptools/bdist_wheel -> distutils/install ->..
1576 # setuptools/bdist_egg -> distutils/install_lib -> build_py
1577 # setuptools/install -> bdist_egg ->..
1578 # setuptools/develop -> ?
1579 # pip install:
1580 # copies source tree to a tempdir before running egg_info/etc
1581 # if .git isn't copied too, 'git describe' will fail
1582 # then does setup.py bdist_wheel, or sometimes setup.py install
1583 # setup.py egg_info -> ?
1584
1585 # we override different "build_py" commands for both environments
1586 if "setuptools" in sys.modules:
1587 from setuptools.command.build_py import build_py as _build_py
1588 else:
1589 from distutils.command.build_py import build_py as _build_py
1590
1591 class cmd_build_py(_build_py):
1592 def run(self):
1593 root = get_root()
1594 cfg = get_config_from_root(root)
1595 versions = get_versions()
1596 _build_py.run(self)
1597 # now locate _version.py in the new build/ directory and replace
1598 # it with an updated value
1599 if cfg.versionfile_build:
1600 target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build)
1601 print("UPDATING %s" % target_versionfile)
1602 write_to_version_file(target_versionfile, versions)
1603
1604 cmds["build_py"] = cmd_build_py
1605
1606 if "cx_Freeze" in sys.modules: # cx_freeze enabled?
1607 from cx_Freeze.dist import build_exe as _build_exe
1608
1609 # nczeczulin reports that py2exe won't like the pep440-style string
1610 # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
1611 # setup(console=[{
1612 # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION
1613 # "product_version": versioneer.get_version(),
1614 # ...
1615
1616 class cmd_build_exe(_build_exe):
1617 def run(self):
1618 root = get_root()
1619 cfg = get_config_from_root(root)
1620 versions = get_versions()
1621 target_versionfile = cfg.versionfile_source
1622 print("UPDATING %s" % target_versionfile)
1623 write_to_version_file(target_versionfile, versions)
1624
1625 _build_exe.run(self)
1626 os.unlink(target_versionfile)
1627 with open(cfg.versionfile_source, "w") as f:
1628 LONG = LONG_VERSION_PY[cfg.VCS]
1629 f.write(
1630 LONG
1631 % {
1632 "DOLLAR": "$",
1633 "STYLE": cfg.style,
1634 "TAG_PREFIX": cfg.tag_prefix,
1635 "PARENTDIR_PREFIX": cfg.parentdir_prefix,
1636 "VERSIONFILE_SOURCE": cfg.versionfile_source,
1637 }
1638 )
1639
1640 cmds["build_exe"] = cmd_build_exe
1641 del cmds["build_py"]
1642
1643 if "py2exe" in sys.modules: # py2exe enabled?
1644 try:
1645 from py2exe.distutils_buildexe import py2exe as _py2exe # py3
1646 except ImportError:
1647 from py2exe.build_exe import py2exe as _py2exe # py2
1648
1649 class cmd_py2exe(_py2exe):
1650 def run(self):
1651 root = get_root()
1652 cfg = get_config_from_root(root)
1653 versions = get_versions()
1654 target_versionfile = cfg.versionfile_source
1655 print("UPDATING %s" % target_versionfile)
1656 write_to_version_file(target_versionfile, versions)
1657
1658 _py2exe.run(self)
1659 os.unlink(target_versionfile)
1660 with open(cfg.versionfile_source, "w") as f:
1661 LONG = LONG_VERSION_PY[cfg.VCS]
1662 f.write(
1663 LONG
1664 % {
1665 "DOLLAR": "$",
1666 "STYLE": cfg.style,
1667 "TAG_PREFIX": cfg.tag_prefix,
1668 "PARENTDIR_PREFIX": cfg.parentdir_prefix,
1669 "VERSIONFILE_SOURCE": cfg.versionfile_source,
1670 }
1671 )
1672
1673 cmds["py2exe"] = cmd_py2exe
1674
1675 # we override different "sdist" commands for both environments
1676 if "setuptools" in sys.modules:
1677 from setuptools.command.sdist import sdist as _sdist
1678 else:
1679 from distutils.command.sdist import sdist as _sdist
1680
1681 class cmd_sdist(_sdist):
1682 def run(self):
1683 versions = get_versions()
1684 self._versioneer_generated_versions = versions
1685 # unless we update this, the command will keep using the old
1686 # version
1687 self.distribution.metadata.version = versions["version"]
1688 return _sdist.run(self)
1689
1690 def make_release_tree(self, base_dir, files):
1691 root = get_root()
1692 cfg = get_config_from_root(root)
1693 _sdist.make_release_tree(self, base_dir, files)
1694 # now locate _version.py in the new base_dir directory
1695 # (remembering that it may be a hardlink) and replace it with an
1696 # updated value
1697 target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
1698 print("UPDATING %s" % target_versionfile)
1699 write_to_version_file(
1700 target_versionfile, self._versioneer_generated_versions
1701 )
1702
1703 cmds["sdist"] = cmd_sdist
1704
1705 return cmds
1706
1707
1708 CONFIG_ERROR = """
1709 setup.cfg is missing the necessary Versioneer configuration. You need
1710 a section like:
1711
1712 [versioneer]
1713 VCS = git
1714 style = pep440
1715 versionfile_source = src/myproject/_version.py
1716 versionfile_build = myproject/_version.py
1717 tag_prefix =
1718 parentdir_prefix = myproject-
1719
1720 You will also need to edit your setup.py to use the results:
1721
1722 import versioneer
1723 setup(version=versioneer.get_version(),
1724 cmdclass=versioneer.get_cmdclass(), ...)
1725
1726 Please read the docstring in ./versioneer.py for configuration instructions,
1727 edit setup.cfg, and re-run the installer or 'python versioneer.py setup'.
1728 """
1729
1730 SAMPLE_CONFIG = """
1731 # See the docstring in versioneer.py for instructions. Note that you must
1732 # re-run 'versioneer.py setup' after changing this section, and commit the
1733 # resulting files.
1734
1735 [versioneer]
1736 #VCS = git
1737 #style = pep440
1738 #versionfile_source =
1739 #versionfile_build =
1740 #tag_prefix =
1741 #parentdir_prefix =
1742
1743 """
1744
1745 INIT_PY_SNIPPET = """
1746 from ._version import get_versions
1747 __version__ = get_versions()['version']
1748 del get_versions
1749 """
1750
1751
1752 def do_setup():
1753 """Main VCS-independent setup function for installing Versioneer."""
1754 root = get_root()
1755 try:
1756 cfg = get_config_from_root(root)
1757 except (
1758 EnvironmentError,
1759 configparser.NoSectionError,
1760 configparser.NoOptionError,
1761 ) as e:
1762 if isinstance(e, (EnvironmentError, configparser.NoSectionError)):
1763 print("Adding sample versioneer config to setup.cfg", file=sys.stderr)
1764 with open(os.path.join(root, "setup.cfg"), "a") as f:
1765 f.write(SAMPLE_CONFIG)
1766 print(CONFIG_ERROR, file=sys.stderr)
1767 return 1
1768
1769 print(" creating %s" % cfg.versionfile_source)
1770 with open(cfg.versionfile_source, "w") as f:
1771 LONG = LONG_VERSION_PY[cfg.VCS]
1772 f.write(
1773 LONG
1774 % {
1775 "DOLLAR": "$",
1776 "STYLE": cfg.style,
1777 "TAG_PREFIX": cfg.tag_prefix,
1778 "PARENTDIR_PREFIX": cfg.parentdir_prefix,
1779 "VERSIONFILE_SOURCE": cfg.versionfile_source,
1780 }
1781 )
1782
1783 ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py")
1784 if os.path.exists(ipy):
1785 try:
1786 with open(ipy, "r") as f:
1787 old = f.read()
1788 except EnvironmentError:
1789 old = ""
1790 if INIT_PY_SNIPPET not in old:
1791 print(" appending to %s" % ipy)
1792 with open(ipy, "a") as f:
1793 f.write(INIT_PY_SNIPPET)
1794 else:
1795 print(" %s unmodified" % ipy)
1796 else:
1797 print(" %s doesn't exist, ok" % ipy)
1798 ipy = None
1799
1800 # Make sure both the top-level "versioneer.py" and versionfile_source
1801 # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so
1802 # they'll be copied into source distributions. Pip won't be able to
1803 # install the package without this.
1804 manifest_in = os.path.join(root, "MANIFEST.in")
1805 simple_includes = set()
1806 try:
1807 with open(manifest_in, "r") as f:
1808 for line in f:
1809 if line.startswith("include "):
1810 for include in line.split()[1:]:
1811 simple_includes.add(include)
1812 except EnvironmentError:
1813 pass
1814 # That doesn't cover everything MANIFEST.in can do
1815 # (http://docs.python.org/2/distutils/sourcedist.html#commands), so
1816 # it might give some false negatives. Appending redundant 'include'
1817 # lines is safe, though.
1818 if "versioneer.py" not in simple_includes:
1819 print(" appending 'versioneer.py' to MANIFEST.in")
1820 with open(manifest_in, "a") as f:
1821 f.write("include versioneer.py\n")
1822 else:
1823 print(" 'versioneer.py' already in MANIFEST.in")
1824 if cfg.versionfile_source not in simple_includes:
1825 print(
1826 " appending versionfile_source ('%s') to MANIFEST.in"
1827 % cfg.versionfile_source
1828 )
1829 with open(manifest_in, "a") as f:
1830 f.write("include %s\n" % cfg.versionfile_source)
1831 else:
1832 print(" versionfile_source already in MANIFEST.in")
1833
1834 # Make VCS-specific changes. For git, this means creating/changing
1835 # .gitattributes to mark _version.py for export-subst keyword
1836 # substitution.
1837 do_vcs_install(manifest_in, cfg.versionfile_source, ipy)
1838 return 0
1839
1840
1841 def scan_setup_py():
1842 """Validate the contents of setup.py against Versioneer's expectations."""
1843 found = set()
1844 setters = False
1845 errors = 0
1846 with open("setup.py", "r") as f:
1847 for line in f.readlines():
1848 if "import versioneer" in line:
1849 found.add("import")
1850 if "versioneer.get_cmdclass()" in line:
1851 found.add("cmdclass")
1852 if "versioneer.get_version()" in line:
1853 found.add("get_version")
1854 if "versioneer.VCS" in line:
1855 setters = True
1856 if "versioneer.versionfile_source" in line:
1857 setters = True
1858 if len(found) != 3:
1859 print("")
1860 print("Your setup.py appears to be missing some important items")
1861 print("(but I might be wrong). Please make sure it has something")
1862 print("roughly like the following:")
1863 print("")
1864 print(" import versioneer")
1865 print(" setup( version=versioneer.get_version(),")
1866 print(" cmdclass=versioneer.get_cmdclass(), ...)")
1867 print("")
1868 errors += 1
1869 if setters:
1870 print("You should remove lines like 'versioneer.VCS = ' and")
1871 print("'versioneer.versionfile_source = ' . This configuration")
1872 print("now lives in setup.cfg, and should be removed from setup.py")
1873 print("")
1874 errors += 1
1875 return errors
1876
1877
1878 if __name__ == "__main__":
1879 cmd = sys.argv[1]
1880 if cmd == "setup":
1881 errors = do_setup()
1882 errors += scan_setup_py()
1883 if errors:
1884 sys.exit(1)