diff --git a/.coveragerc b/.coveragerc index 5c5f643..64ca2b0 100644 --- a/.coveragerc +++ b/.coveragerc @@ -14,4 +14,3 @@ if __name__ == .__main__.: ignore_errors = True - diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..5d1335a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,23 @@ +--- +name: Bug report +about: Report unexpected behavior, a crash, or incorrect results + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Environment (please complete the following information):** + - Python Version: [e.g. 3.6] + - OS [e.g. Windows, Fedora] + - If the bug involves `LOCALE` or `humansorted`: + - Is `PyICU` installed? + - Do you have a locale set? If so, to what? + +**To Reproduce** +Include a Minimum, Complete, Verifiable Example. If there is a traceback (or error message), **please** include the *entire* traceback (or error message), even if you think it is too big. + +See https://stackoverflow.com/help/mcve for an explanation. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..5024e49 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,14 @@ +--- +name: Feature request +about: Suggest or request an enhancement + +--- + +**Describe the feature or enhancement** +Be as descriptive and precise as possible. + +**Provide a concrete example of how the feature or enhancement will improve `natsort`** +Code examples are an excellent way to show how this feature or enhancement will help. To make your case stronger, show the current workaround due to the lack of the feature. What is the return-on-investment for including the feature or enhancement? + +**Would you be willing to submit a Pull Request for this feature?** +Extra help is *always* welcome. 
diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..adaf5c5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,7 @@ +--- +name: Question +about: Inquiry about natsort + +--- + +- [ ] I have read the [`natsort` documentation](https://natsort.readthedocs.io/en/master/) and the [README](https://github.com/SethMMorton/natsort#natsort), and my question is still not answered diff --git a/.travis.yml b/.travis.yml index 5d113de..7ff0718 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,14 +1,13 @@ +dist: xenial +sudo: false language: python +cache: pip jobs: include: - python: "2.7" - dist: trusty - sudo: false env: WITH_EXTRAS="" - python: "2.7" - dist: trusty - sudo: false env: WITH_EXTRAS="fast,icu" addons: apt: @@ -17,20 +16,12 @@ - language-pack-de - language-pack-en - python: "3.4" - dist: trusty - sudo: false env: WITH_EXTRAS="" - python: "3.5" - dist: trusty - sudo: false env: WITH_EXTRAS="" - python: "3.6" - dist: trusty - sudo: false env: WITH_EXTRAS="" - python: "3.6" - dist: trusty - sudo: false env: WITH_EXTRAS="fast,icu" addons: apt: @@ -39,15 +30,15 @@ - language-pack-de - language-pack-en - python: "3.7" - dist: xenial - sudo: true env: WITH_EXTRAS="" - stage: code-quality python: "3.6" - dist: trusty - sudo: false - install: pip install flake8 flake8-import-order flake8-bugbear pep8-naming - script: flake8 + install: pip install flake8 flake8-import-order flake8-bugbear pep8-naming twine check-manifest + script: + - flake8 + - check-manifest --ignore ".github*,*.md,.coveragerc" + - python setup.py sdist + - twine check dist/* install: - pip install -U pip diff --git a/CHANGELOG.rst b/CHANGELOG.rst new file mode 100644 index 0000000..eb5712b --- /dev/null +++ b/CHANGELOG.rst @@ -0,0 +1,382 @@ +02-04-2019 v. 
6.0.0 ++++++++++++++++++++ + + - Drop support for Python 2.6 and 3.3 (thanks @jdufresne) (issue #70) + - Remove deprecated APIs (kwargs number_type, signed, exp, as_path, py3_safe; enums ns.TYPESAFE, ns.DIGIT, ns.VERSION; functions versorted, index_versorted) (issue #81) + - Remove pipenv as a dependency for building (issue #86) + - Simplify Travis-CI configuration (thanks @jdufresne) (issue #88) + - Fix README rendering in PyPI (thanks @altendky) (issue #89) + +11-18-2018 v. 5.5.0 ++++++++++++++++++++ + + - Formally deprecated old or misleading APIs (issue #83) + - Documentation, packaging, and CI cleanup (thanks @jdufresne) (issues #69, #71-#80) + - Consolidate API documentation into a single page (issue #82) + - Add a CHANGELOG.rst to the top-level of the repository (issue #85) + - Add back support for very old versions of setuptools (issue #84) + +09-09-2018 v. 5.4.1 ++++++++++++++++++++ + + - Fix error in a newly added test (issues #65, #67) + - Changed code format and quality checking infrastructure (issue #68) + +09-06-2018 v. 5.4.0 ++++++++++++++++++++ + + - Re-expose ``natsort_key`` as "public" and remove the + associated ``DeprecationWarning`` + - Add better developer documentation + - Refactor tests (issue #66) + - Bump allowed ``fastnumbers`` version + +07-07-2018 v. 5.3.3 ++++++++++++++++++++ + + - Update docs with a FAQ and quick how-it-works (issue #60) + - Fix a StopIteration error in the testing code + - Enable Python 3.7 support in Travis-CI (issue #61) + +05-17-2018 v. 5.3.2 ++++++++++++++++++++ + + - Fix bug that prevented install on old versions of setuptools (issues #55, #56) + - Revert layout from src/natsort/ back to natsort/ to make user + testing simpler (issues #57, #58) + +05-14-2018 v. 
5.3.1 ++++++++++++++++++++ + + - No bugfixes or features, just infrastructure and installation updates + - Move to defining dependencies with Pipfile + - Development layout is now src/natsort/ instead of natsort/ + - Add bumpversion infrastructure + - Extras can be installed by "[]" notation + +04-20-2018 v. 5.3.0 ++++++++++++++++++++ + + - Fix bug in assessing ``fastnumbers`` version at import-time (thanks @hholzgra) (issues #51, #53) + - Add ability to consider unicode-decimal numbers as numbers (issues #52, #54) + +02-14-2018 v. 5.2.0 ++++++++++++++++++++ + + - Add ``ns.NUMAFTER`` to cause numbers to be placed after non-numbers (issues #48, #49) + - Add ``natcmp`` function (Python 2 only) (thanks @rinslow) (issue #47) + +11-11-2017 v. 5.1.1 ++++++++++++++++++++ + + - Added additional unicode number support for Python 3.7 + - Added information on how to install and test (issue #46) + +08-19-2017 v. 5.1.0 ++++++++++++++++++++ + + - Fixed ``StopIteration`` warning on Python 3.6+ (thanks @lykinsbd) (issues #42, #43) + - All Unicode input is now normalized (issue #44, #45) + +04-30-2017 v. 5.0.3 ++++++++++++++++++++ + + - Improved development infrastructure + - Migrated documentation to ReadTheDocs + +01-02-2017 v. 5.0.2 ++++++++++++++++++++ + + - Added additional unicode number support for Python 3.6 + - Renamed several internal functions and variables to improve clarity + - Improved documentation examples + - Added a "how does it work?" section to the documentation + +06-04-2016 v. 5.0.1 ++++++++++++++++++++ + + - The ``ns`` enum attributes can now be imported from the top-level + namespace + - Fixed a bug with the ``from natsort import *`` mechanism + - Fixed bug with using ``natsort`` with ``python -OO`` (issues #38, #39) + +05-08-2016 v. 5.0.0 ++++++++++++++++++++ + + - ``ns.LOCALE``/``humansorted`` now accounts for thousands separators (issue #36) + - Refactored entire codebase to be more functional (as in use functions as + units). 
Previously, the code was rather monolithic and difficult to follow. The + goal is that with the code existing in smaller units, contributing will + be easier (issue #37) + - Deprecated ``ns.TYPESAFE`` option as it is now always on (due to a new + iterator-based algorithm, the typesafe function is now cheap) + - Increased speed of execution (came for free with the new functional approach + because the new factory function paradigm eliminates most ``if`` branches + during execution) + + - For most cases, the code is 30-40% faster than version 4.0.4 + - If using ``ns.LOCALE`` or ``humansorted``, the code is 1100% faster than + version 4.0.4 + + - Improved clarity of documentation with regards to locale-aware sorting + - Added a new ``chain_functions`` function for convenience in creating + a complex user-given ``key`` from several existing functions + +11-01-2015 v. 4.0.4 ++++++++++++++++++++ + + - Improved coverage of unit tests + - Unit tests use new and improved hypothesis library + - Fixed compatibility issues with Python 3.5 + +06-25-2015 v. 4.0.3 ++++++++++++++++++++ + + - Fixed bad install on last release (sorry guys!) (issue #30) + +06-24-2015 v. 4.0.2 ++++++++++++++++++++ + + - Added back Python 2.6 and Python 3.2 compatibility. Unit testing is now + performed for these versions (thanks @dpetzold) (issue #29) + - Consolidated under-the-hood compatibility functionality + +06-04-2015 v. 4.0.1 ++++++++++++++++++++ + + - Added support for sorting NaN by internally converting to -Infinity + or +Infinity (issue #27) + +05-17-2015 v. 4.0.0 ++++++++++++++++++++ + + - Made default behavior of 'natsort' search for unsigned ints, + rather than signed floats. 
This is a backwards-incompatible + change but in 99% of use cases it should not require any + end-user changes (issue #20) + - Improved handling of locale-aware sorting on systems where the + underlying locale library is broken (issue #34) + - Greatly improved all unit tests by adding the hypothesis library + +04-06-2015 v. 3.5.6 ++++++++++++++++++++ + + - Added 'UNGROUPLETTERS' algorithm to get the case-grouping behavior of + an ordinal sort when using 'LOCALE' (issue #23) + - Added convenience functions 'decoder', 'as_ascii', and 'as_utf8' for + dealing with bytes types + +04-04-2015 v. 3.5.5 ++++++++++++++++++++ + + - Added 'realsorted' and 'index_realsorted' functions for + forward-compatibility with >= 4.0.0 + - Made explanation of when to use "TYPESAFE" more clear in the docs + +04-02-2015 v. 3.5.4 ++++++++++++++++++++ + + - Fixed bug where a 'TypeError' was raised if a string containing a leading + number was sorted with alpha-only strings when 'LOCALE' is used (issue #22) + +03-26-2015 v. 3.5.3 ++++++++++++++++++++ + + - Fixed bug where '--reverse-filter' option in shell script was not + getting checked for correctness + - Documentation updates to better describe locale bug, and illustrate + upcoming default behavior change + - Internal improvements, including making test suite more granular + +01-13-2015 v. 3.5.2 ++++++++++++++++++++ + + - Enhancement that will convert a 'pathlib.Path' object to a 'str' if + 'ns.PATH' is enabled (issue #16) + +09-25-2014 v. 3.5.1 ++++++++++++++++++++ + + - Fixed bug that caused list/tuples to fail when using 'ns.LOWERCASEFIRST' + or 'ns.IGNORECASE' (issue #15) + - Refactored modules so that only the public API was in natsort.py and + ns_enum.py + - Refactored all import statements to be absolute, not relative + + +09-02-2014 v. 3.5.0 ++++++++++++++++++++ + + - Added the 'alg' argument to the 'natsort' functions. This argument + accepts an enum that is used to indicate the options the user wishes + to use. 
The 'number_type', 'signed', 'exp', 'as_path', and 'py3_safe' + options are being deprecated and will become (undocumented) + keyword-only options in natsort version 4.0.0 + - The user can now modify how 'natsort' handles the case of non-numeric + characters (issue #14) + - The user can now instruct 'natsort' to use locale-aware sorting, which + allows 'natsort' to perform true "human sorting" (issue #14) + + - The `humansorted` convenience function has been included to make this + easier + + - Updated shell script with locale functionality + +08-12-2014 v. 3.4.1 ++++++++++++++++++++ + + - 'natsort' will now use the 'fastnumbers' module if it is installed. This + gives up to an extra 30% boost in speed over the previous performance + enhancements + - Made documentation point to more 'natsort' resources, and also added a + new example in the examples section + +07-19-2014 v. 3.4.0 ++++++++++++++++++++ + + - Fixed a bug that caused user's options to the 'natsort_key' to not be + passed on to recursive calls of 'natsort_key' (issue #12) + - Added a 'natsort_keygen' function that will generate a wrapped version + of 'natsort_key' that is easier to call. 'natsort_key' is now set to + deprecate at natsort version 4.0.0 + - Added an 'as_path' option to 'natsorted' & co. that will try to treat + input strings as filepaths. This will help yield correct results for + OS-generated inputs like + ``['/p/q/o.x', '/p/q (1)/o.x', '/p/q (10)/o.x', '/p/q/o (1).x']`` (issue #3) + - Massive performance enhancements for string input (1.8x-2.0x), at the expense + of reduction in speed for numeric input (~2.0x) + + - This is a good compromise because the most common input will be strings, + not numbers, and sorting numbers still only takes 0.6x the time of sorting + strings. 
If you are sorting only numbers, you would use 'sorted' anyway + + - Added the 'order_by_index' function to help in using the output of + 'index_natsorted' and 'index_versorted' + - Added the 'reverse' option to 'natsorted' & co. to make its API more + similar to the builtin 'sorted' + - Added more unit tests + - Added auxiliary test code that helps in profiling and stress-testing + - Reworked the documentation, moving most of it to PyPI's hosting platform + - Added support for coveralls.io + - Entire codebase is now PyFlakes and PEP8 compliant + +06-28-2014 v. 3.3.0 ++++++++++++++++++++ + + - Added a 'versorted' method for more convenient sorting of versions (issue #11) + - Updated command-line tool --number_type option with 'version' and 'ver' + to make it more clear how to sort version numbers + - Moved unit-testing mechanism from being docstring-based to actual unit tests + in actual functions (issue #10) + + - This has provided the ability to determine the coverage of the unit tests (99%) + - This also makes the pydoc documentation a bit more clear + + - Made docstrings for public functions mirror the README API + - Connected natsort development to Travis-CI to help ensure quality releases + +06-20-2014 v. 3.2.1 ++++++++++++++++++++ + + - Re-"Fixed" unorderable types issue on Python 3.x - this workaround + is for when the problem occurs in the middle of the string (issue #7 again) + +05-07-2014 v. 3.2.0 ++++++++++++++++++++ + + - "Fixed" unorderable types issue on Python 3.x with a workaround that + attempts to replicate the Python 2.x behavior by putting all the numbers + (or strings that begin with numbers) first (issue #7) + - Now explicitly excluding __pycache__ from releases by adding a prune statement + to MANIFEST.in + +05-05-2014 v. 3.1.2 ++++++++++++++++++++ + + - Added setup.cfg to support universal wheels (issue #6) + - Added Python 3.0 and Python 3.1 as requiring the argparse module + +03-01-2014 v. 
3.1.1 ++++++++++++++++++++ + + - Added ability to sort lists of lists (issue #5) + - Cleaned up import statements + +01-20-2014 v. 3.1.0 ++++++++++++++++++++ + + - Added the ``signed`` and ``exp`` options to allow finer tuning of the sorting + - Entire codebase now works for both Python 2 and Python 3 without needing to run + ``2to3`` + - Updated all doctests + - Further simplified the ``natsort`` base code by removing unneeded functions. + - Simplified documentation where possible + - Improved the shell script code + + - Made the documentation less "path"-centric to make it clear it is not just + for sorting file paths + - Removed the filesystem-based options because these can be achieved better + through a pipeline + - Added doctests + - Added new options that correspond to ``signed`` and ``exp`` + - The user can now specify multiple numbers to exclude or multiple ranges + to filter by + +10-01-2013 v. 3.0.2 ++++++++++++++++++++ + + - Made float, int, and digit searching algorithms all share the same base function + - Fixed some outdated comments + - Made the ``__version__`` variable available when importing the module + +8-15-2013 v. 3.0.1 +++++++++++++++++++ + + - Added support for unicode strings (issue #2) + - Removed extraneous ``string2int`` function + - Fixed empty string removal function + +7-13-2013 v. 3.0.0 +++++++++++++++++++ + + - Added a ``number_type`` argument to the sorting functions to specify how + liberal to be when deciding what a number is + - Reworked the documentation + +6-25-2013 v. 2.2.0 +++++++++++++++++++ + + - Added ``key`` attribute to ``natsorted`` and ``index_natsorted`` so that + it mimics the functionality of the built-in ``sorted`` (issue #1) + - Added tests to reflect the new functionality, as well as tests demonstrating + how to get similar functionality using ``natsort_key`` + +12-5-2012 v. 
2.1.0 +++++++++++++++++++ + + - Reorganized package + - Now using a platform independent shell script generator (entry_points + from distribute) + - Can now execute natsort from command line with ``python -m natsort`` + as well + +11-30-2012 v. 2.0.2 ++++++++++++++++++++ + + - Added the use_2to3 option to setup.py + - Added distribute_setup.py to the distribution + - Added dependency to the argparse module (for python2.6) + +11-21-2012 v. 2.0.1 ++++++++++++++++++++ + + - Reorganized directory structure + - Added tests into the natsort.py file itself + +11-16-2012, v. 2.0.0 +++++++++++++++++++++ + + - Updated sorting algorithm to support floats (including exponentials) and + basic version number support + - Added better README documentation + - Added doctests diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 2d55f88..f122b4f 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -40,7 +40,7 @@ ## Attribution -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html][version] -[homepage]: http://contributor-covenant.org -[version]: http://contributor-covenant.org/version/1/4/ +[homepage]: https://www.contributor-covenant.org/ +[version]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c3de3de..05f8492 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,10 @@ If you have an idea for how to improve `natsort`, please contribute! It can be as simple as a bug fix or documentation update, or as complicated as a more -robust algorithm. +robust algorithm. 
Contributions that change the public API of +`natsort` will have to ensure that the library does not become +less usable after the contribution and is backwards-compatible (unless there is +a good reason not to be). I do not have strong opinions on how one should contribute, so I have copy/pasted some text verbatim from the diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md deleted file mode 100644 index ec73511..0000000 --- a/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,5 +0,0 @@ -## Minimum, Complete, Verifiable Example - -See https://stackoverflow.com/help/mcve for explanation. - -## Error message, Traceback, Desired behavior, Suggestion, Request, or Question diff --git a/MANIFEST.in b/MANIFEST.in index f5008fd..c385901 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,16 +1,9 @@ -include README.rst include LICENSE -include *.md -include *.sh -include Pipfile -include setup.py -include setup.cfg +include CHANGELOG.rst +include clean.sh +include dev-requirements.txt include tox.ini -include .travis.yml -include .coveragerc -include .gitignore -include .bumpversion.cfg graft docs graft natsort -graft test_natsort +graft tests global-exclude *.py[cod] __pycache__ *.so diff --git a/Pipfile b/Pipfile deleted file mode 100644 index be4c9a8..0000000 --- a/Pipfile +++ /dev/null @@ -1,10 +0,0 @@ -[dev-packages] -coverage = "*" -pytest = ">=3.5" -pytest-cov = "*" -pytest-mock = ">=1.1" -hypothesis = ">=3.8.0" -pytest-faulthandler = {version = "*", platform_python_implementation = "== 'CPython'"} - -# These packages are standard on newer python versions. -pathlib = {version = "*", python_version = "< '3.4'"} diff --git a/README.rst b/README.rst index 93897e9..2b3108e 100644 --- a/README.rst +++ b/README.rst @@ -23,17 +23,20 @@ - Source Code: https://github.com/SethMMorton/natsort - Downloads: https://pypi.org/project/natsort/ - - Documentation: http://natsort.readthedocs.io/ - - - `Examples and Recipes `_ - - `How Does Natsort Work? 
`_ - - `API `_ + - Documentation: https://natsort.readthedocs.io/ + + - `Examples and Recipes `_ + - `How Does Natsort Work? `_ + - `API `_ - `FAQ`_ - `Optional Dependencies`_ - `fastnumbers `_ >= 2.0.0 - `PyICU `_ >= 1.0.0 + +**NOTE**: Please see the `Deprecation Schedule`_ section for changes in +``natsort`` version 6.0.0 and in the upcoming version 7.0.0. Quick Description ----------------- @@ -42,7 +45,7 @@ sort algorithm sorts lexicographically, so you might not get the results that you expect: -.. code-block:: python +.. code-block:: pycon >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] >>> sorted(a) @@ -57,7 +60,7 @@ sorting based on meaning and not computer code point). Using ``natsorted`` is simple: -.. code-block:: python +.. code-block:: pycon >>> from natsort import natsorted >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] @@ -66,16 +69,16 @@ ``natsorted`` identifies numbers anywhere in a string and sorts them naturally. Below are some other things you can do with ``natsort`` -(also see the `examples `_ +(also see the `examples `_ for a quick start guide, or the -`api `_ for complete details). +`api `_ for complete details). **Note**: ``natsorted`` is designed to be a drop-in replacement for the built-in ``sorted`` function. Like ``sorted``, ``natsorted`` `does not sort in-place`. To sort a list and assign the output to the same variable, you must explicitly assign the output to a variable: -.. code-block:: python +.. code-block:: pycon >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] >>> natsorted(a) @@ -95,25 +98,30 @@ Sorting Versions ++++++++++++++++ -This is handled properly by default (as of ``natsort`` version >= 4.0.0): - -.. code-block:: python +``natsort`` does not actually *comprehend* version numbers. 
+It just so happens that the most common versioning schemes are designed to +work with standard natural sorting techniques; these schemes include +``MAJOR.MINOR``, ``MAJOR.MINOR.PATCH``, ``YEAR.MONTH.DAY``. If your data +conforms to a scheme like this, then it will work out-of-the-box with +``natsorted`` (as of ``natsort`` version >= 4.0.0): + +.. code-block:: pycon >>> a = ['version-1.9', 'version-2.0', 'version-1.11', 'version-1.10'] >>> natsorted(a) ['version-1.9', 'version-1.10', 'version-1.11', 'version-2.0'] -If you need to sort release candidates, please see -`this useful hack `_. +If you need to sort versions that use a more complicated scheme, please see +`these examples `_. Sorting by Real Numbers (i.e. Signed Floats) ++++++++++++++++++++++++++++++++++++++++++++ -This is useful in scientific data analysis and was +This is useful in scientific data analysis (and was the default behavior of ``natsorted`` for ``natsort`` -version < 4.0.0. Use the ``realsorted`` function: - -.. code-block:: python +version < 4.0.0). Use the ``realsorted`` function: + +.. code-block:: pycon >>> from natsort import realsorted, ns >>> # Note that when interpreting as signed floats, the below numbers are @@ -134,7 +142,7 @@ separator is accounted for in the number. This can be achieved with the ``humansorted`` function: -.. code-block:: python +.. code-block:: pycon >>> a = ['Apple', 'apple15', 'Banana', 'apple14,689', 'banana'] >>> natsorted(a) @@ -150,7 +158,7 @@ You may find you need to explicitly set the locale to get this to work (as shown in the example). -Please see `locale issues `_ and the +Please see `locale issues `_ and the `Optional Dependencies`_ section below before using the ``humansorted`` function. Further Customizing Natsort @@ -160,7 +168,7 @@ ``ns.LOCALE``, and ``ns.IGNORECASE``), you can combine the options using the bitwise OR operator (``|``). For example, -.. code-block:: python +.. 
code-block:: pycon >>> a = ['Apple', 'apple15', 'Banana', 'apple14,689', 'banana'] >>> natsorted(a, alg=ns.REAL | ns.LOCALE | ns.IGNORECASE) @@ -175,12 +183,12 @@ True All of the available customizations can be found in the documentation for -`the ns enum `_. +`the ns enum `_. You can also add your own custom transformation functions with the ``key`` argument. These can be used with ``alg`` if you wish. -.. code-block:: python +.. code-block:: pycon >>> a = ['apple2.50', '2.3apple'] >>> natsorted(a, key=lambda x: x.replace('apple', ''), alg=ns.REAL) @@ -192,7 +200,7 @@ You can mix and match ``int``, ``float``, and ``str`` (or ``unicode``) types when you sort: -.. code-block:: python +.. code-block:: pycon >>> a = ['4.5', 6, 2.0, '5', 'a'] >>> natsorted(a) @@ -206,7 +214,7 @@ ``natsort`` does not officially support the `bytes` type on Python 3, but convenience functions are provided that help you decode to `str` first: -.. code-block:: python +.. code-block:: pycon >>> from natsort import as_utf8 >>> a = [b'a', 14.0, 'b'] @@ -229,7 +237,7 @@ generate a custom sorting key to sort in-place using the ``list.sort`` method. -.. code-block:: python +.. code-block:: pycon >>> from natsort import natsort_keygen >>> natsort_key = natsort_keygen() @@ -248,9 +256,9 @@ - recursively descend into lists of lists - automatic unicode normalization of input data - - `controlling the case-sensitivity `_ - - `sorting file paths correctly `_ - - `allow custom sorting keys `_ + - `controlling the case-sensitivity `_ + - `sorting file paths correctly `_ + - `allow custom sorting keys `_ FAQ --- @@ -261,7 +269,7 @@ exactly what is being done with their input using this key - it is highly recommended to `look at this issue describing how to debug `_ for *how* to debug, and also to review the - `How Does Natsort Work? `_ + `How Does Natsort Work? `_ page for *why* ``natsort`` is doing that to your data. 
If you are trying to sort custom classes and running into trouble, please take a look at @@ -272,7 +280,7 @@ use the ``natsort`` key as part of your rich comparison operator definition. How *does* ``natsort`` work? - If you don't want to read `How Does Natsort Work? `_, + If you don't want to read `How Does Natsort Work? `_, here is a quick primer. ``natsort`` provides a `key function `_ @@ -282,7 +290,7 @@ key generator ``natsort.natsort_keygen()``. ``natsort.natsorted()`` is essentially a wrapper for the following code: - .. code-block:: python + .. code-block:: pycon >>> from natsort import natsort_keygen >>> natsort_key = natsort_keygen() @@ -316,13 +324,12 @@ ------------ ``natsort`` comes with a shell script called ``natsort``, or can also be called -from the command line with ``python -m natsort``. +from the command line with ``python -m natsort``. Requirements ------------ -``natsort`` requires Python version 2.6 or greater or Python 3.3 or greater. -It may run on (but is not tested against) Python 3.2. +``natsort`` requires Python version 2.7 or Python 3.4 or greater. Optional Dependencies --------------------- @@ -344,14 +351,14 @@ It is recommended that you install `PyICU `_ if you wish to sort in a locale-dependent manner, see -http://natsort.readthedocs.io/en/master/locale_issues.html for an explanation why. +https://natsort.readthedocs.io/en/master/locale_issues.html for an explanation why. Installation ------------ Use ``pip``! -.. code-block:: sh +.. code-block:: console $ pip install natsort @@ -361,7 +368,7 @@ `fastnumbers `_ and ``icu`` for `PyICU `_. -.. code-block:: sh +.. code-block:: console # Install both optional dependencies. $ pip install natsort[fast,icu] @@ -377,27 +384,95 @@ After installing ``tox``, running tests is as simple as executing the following in the ``natsort`` directory: -.. code-block:: sh +.. 
code-block:: console $ tox ``tox`` will create virtual a virtual environment for your tests and install all the needed testing requirements for you. You can specify a particular python version -with the ``-e`` flag, e.g. ``tox -e py36``. - -If you do not wish to use ``tox``, you can install the testing dependencies and run the -tests manually using `pytest `_ - ``natsort`` -contains a ``Pipfile`` for use with `pipenv `_ that -makes it easy for you to install the testing dependencies: - -.. code-block:: sh - - $ pipenv install --skip-lock --dev - $ pipenv run python -m pytest +with the ``-e`` flag, e.g. ``tox -e py36``. Static analysis is done with ``tox -e flake8``. +You can see all available testing environments with ``tox --listenvs``. + +If you do not wish to use ``tox``, you can install the testing dependencies with the +``dev-requirements.txt`` file and then run the tests manually using +`pytest `_. + +.. code-block:: console + + $ pip install -r dev-requirements.txt + $ python -m pytest Note that above I invoked ``python -m pytest`` instead of just ``pytest`` - this is because `the former puts the CWD on sys.path `_. +How to Build Documentation +-------------------------- + +If you want to build the documentation for ``natsort``, it is recommended to use ``tox``: + +.. code-block:: console + + $ tox -e docs + +This will place the documentation in ``build/sphinx/html``. If you do not +wish to use ``tox``, you can do the following: + +.. code-block:: console + + $ pip install sphinx sphinx_rtd_theme + $ python setup.py build_sphinx + +Deprecation Schedule +-------------------- + +Dropping Python 2.7 Support ++++++++++++++++++++++++++++ + +``natsort`` version 7.0.0 will drop support for Python 2.7. + +The version 6.X branch will remain as a "long term support" branch where bug fixes +are applied so that users who cannot update from Python 2.7 will not be forced to +use a buggy ``natsort`` version. 
Once version 7.0.0 is released, new features +will not be added to version 6.X, only bug fixes. + +Deprecated APIs ++++++++++++++++ + +In ``natsort`` version 6.0.0, the following APIs and functions were removed + + - ``number_type`` keyword argument (deprecated since 3.4.0) + - ``signed`` keyword argument (deprecated since 3.4.0) + - ``exp`` keyword argument (deprecated since 3.4.0) + - ``as_path`` keyword argument (deprecated since 3.4.0) + - ``py3_safe`` keyword argument (deprecated since 3.4.0) + - ``ns.TYPESAFE`` (deprecated since version 5.0.0) + - ``ns.DIGIT`` (deprecated since version 5.0.0) + - ``ns.VERSION`` (deprecated since version 5.0.0) + - ``versorted()`` (discouraged since version 4.0.0, officially deprecated since version 5.5.0) + - ``index_versorted()`` (discouraged since version 4.0.0, officially deprecated since version 5.5.0) + +In general, if you want to determine if you are using deprecated APIs you can run your +code with the following flag + +.. code-block:: console + + $ python -Wdefault::DeprecationWarning my-code.py + +By default ``DeprecationWarnings`` are not shown, but this will cause them to be shown. +Alternatively, you can just set the environment variable ``PYTHONWARNINGS`` to +"default::DeprecationWarning" and then run your code. + +Dropped Pipenv for Development +++++++++++++++++++++++++++++++ + +``natsort`` version 6.0.0 no longer uses `Pipenv `_ +to install development dependencies. + +Dropped Python 2.6 and 3.3 Support +++++++++++++++++++++++++++++++++++ + +``natsort`` version 6.0.0 dropped support for Python 2.6 and Python 3.3. + Author ------ @@ -406,4 +481,6 @@ History ------- -Please visit the `changelog `_. +Please visit the changelog +`on GitHub `_ or +`in the documentation `_. 
diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 0000000..e9af60b --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,9 @@ +coverage +pytest >= 3.5 +pytest-cov +pytest-mock >= 1.1 +hypothesis >= 3.8.0 +pytest-faulthandler; platform_python_implementation == 'CPython' +semver +# These packages are standard on newer python versions. +pathlib; python_version < '3.4' diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..5e7a482 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,97 @@ +.. default-domain:: py +.. currentmodule:: natsort + +.. _api: + +natsort API +=========== + +.. contents:: + :local: + +Standard API +------------ + +:func:`~natsort.natsorted` +++++++++++++++++++++++++++ + +.. autofunction:: natsorted + +The :class:`~natsort.ns` enum ++++++++++++++++++++++++++++++ + +.. autodata:: ns + :annotation: + +:func:`~natsort.natsort_key` +++++++++++++++++++++++++++++ + +.. autofunction:: natsort_key + +:func:`~natsort.natsort_keygen` ++++++++++++++++++++++++++++++++ + +.. autofunction:: natsort_keygen + +Convenience Functions +--------------------- + +:func:`~natsort.realsorted` ++++++++++++++++++++++++++++ + +.. autofunction:: realsorted + +:func:`~natsort.humansorted` +++++++++++++++++++++++++++++ + +.. autofunction:: humansorted + +:func:`~natsort.index_natsorted` +++++++++++++++++++++++++++++++++ + +.. autofunction:: index_natsorted + +:func:`~natsort.index_realsorted` ++++++++++++++++++++++++++++++++++ + +.. autofunction:: index_realsorted + +:func:`~natsort.index_humansorted` +++++++++++++++++++++++++++++++++++ + +.. autofunction:: index_humansorted + +:func:`~natsort.order_by_index` ++++++++++++++++++++++++++++++++ + +.. autofunction:: order_by_index + +.. 
_bytes_help: + +Help With Bytes On Python 3 ++++++++++++++++++++++++++++ + +The official stance of :mod:`natsort` is to not support `bytes` for +sorting; there is just too much that can go wrong when trying to automate +conversion between `bytes` and `str`. But rather than completely give up +on `bytes`, :mod:`natsort` provides three functions that make it easy to +quickly decode `bytes` to `str` so that sorting is possible. + +.. autofunction:: decoder + +.. autofunction:: as_ascii + +.. autofunction:: as_utf8 + +.. _function_help: + +Help With Creating Function Keys +++++++++++++++++++++++++++++++++ + +If you need to create a complicated *key* argument to (for example) +:func:`natsorted` that is actually multiple functions called one after the other, +the following function can help you easily perform this action. It is +used internally to :mod:`natsort`, and has been exposed publicly for +the convenience of the user. + +.. autofunction:: chain_functions diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 0000000..1bc5475 --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1,6 @@ +.. _changelog: + +Changelog +--------- + +.. include:: ../CHANGELOG.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..f4fbab5 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,275 @@ +# -*- coding: utf-8 -*- +# +# natsort documentation build configuration file, created by +# sphinx-quickstart on Thu Jul 17 21:01:29 2014. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here.
If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', + 'sphinx.ext.mathjax', + 'sphinx.ext.napoleon', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'natsort' +# noinspection PyShadowingBuiltins +copyright = u'2014, Seth M. Morton' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The full version, including alpha/beta/rc tags. +release = '6.0.0' +# The short X.Y version. +version = '.'.join(release.split('.')[0:2]) + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. 
+# exclude_patterns = ['solar/*'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' +highlight_language = 'python' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +on_rtd = os.environ.get('READTHEDOCS') == 'True' +if on_rtd: + html_theme = 'default' +else: + import sphinx_rtd_theme + + html_theme = 'sphinx_rtd_theme' + # html_theme = 'solar' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = ['.'] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +# html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +# html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). 
+# html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'natsortdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('index', 'natsort.tex', u'natsort Documentation', + u'Seth M. Morton', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# If true, show page references after internal links. +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'natsort', u'natsort Documentation', + [u'Seth M. Morton'], 1) +] + +# If true, show URL addresses after external links. +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'natsort', u'natsort Documentation', + u'Seth M. 
Morton', 'natsort', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +# texinfo_appendices = [] + +# If false, no module index is generated. +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +# texinfo_no_detailmenu = False + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} diff --git a/docs/examples.rst b/docs/examples.rst new file mode 100644 index 0000000..e44aa1c --- /dev/null +++ b/docs/examples.rst @@ -0,0 +1,385 @@ +.. default-domain:: py +.. currentmodule:: natsort + +.. _examples: + +Examples and Recipes +==================== + +If you want more detailed examples than given on this page, please see +https://github.com/SethMMorton/natsort/tree/master/tests. + +.. contents:: + :local: + +Basic Usage +----------- + +In the most basic use case, simply import :func:`~natsorted` and use +it as you would :func:`sorted`: + +.. code-block:: pycon + + >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] + >>> sorted(a) + ['1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '2 ft 7 in', '7 ft 6 in'] + >>> from natsort import natsorted, ns + >>> natsorted(a) + ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] + +Sort Version Numbers +-------------------- + +As of :mod:`natsort` version >= 4.0.0, :func:`~natsorted` will work for +well-behaved version numbers, like ``MAJOR.MINOR.PATCH``. + +.. _rc_sorting: + +Sorting More Expressive Versioning Schemes +++++++++++++++++++++++++++++++++++++++++++ + +By default, if you wish to sort versions that are not as simple as +``MAJOR.MINOR.PATCH`` (or similar), you may not get the results you expect: + +.. 
code-block:: pycon + + >>> a = ['1.2', '1.2rc1', '1.2beta2', '1.2beta1', '1.2alpha', '1.2.1', '1.1', '1.3'] + >>> natsorted(a) + ['1.1', '1.2', '1.2.1', '1.2alpha', '1.2beta1', '1.2beta2', '1.2rc1', '1.3'] + +To make the '1.2' pre-releases come before '1.2.1', you need to use the following +recipe: + +.. code-block:: pycon + + >>> natsorted(a, key=lambda x: x.replace('.', '~')) + ['1.1', '1.2', '1.2alpha', '1.2beta1', '1.2beta2', '1.2rc1', '1.2.1', '1.3'] + +If you also want '1.2' after all the alpha, beta, and rc candidates, you can +modify the above recipe: + +.. code-block:: pycon + + >>> natsorted(a, key=lambda x: x.replace('.', '~')+'z') + ['1.1', '1.2alpha', '1.2beta1', '1.2beta2', '1.2rc1', '1.2', '1.2.1', '1.3'] + +Please see `this issue `_ to +see why this works. + +Sorting Rigorously Defined Versioning Schemes (e.g. SemVer or PEP 440) +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +If you know you are using a versioning scheme that follows a well-defined format +for which there is third-party module support, you should use those modules +to assist in sorting. Some examples might be +`PEP 440 `_ or +`SemVer `_. + +If we are being honest, using these methods to parse a version means you don't +need to use :mod:`natsort` - you should probably just use :func:`sorted` directly. +Here's an example with SemVer: + +.. code-block:: pycon + + >>> from semver import parse_version_info + >>> a = ['3.4.5-pre.1', '3.4.5', '3.4.5-pre.2+build.4'] + >>> sorted(a, key=parse_version_info) + ['3.4.5-pre.1', '3.4.5-pre.2+build.4', '3.4.5'] + +.. _path_sort: + +Sort OS-Generated Paths +----------------------- + +In some cases when sorting file paths with OS-Generated names, the default +:mod:`~natsorted` algorithm may not be sufficient. In cases like these, +you may need to use the ``ns.PATH`` option: + +.. code-block:: pycon + + >>> a = ['./folder/file (1).txt', + ... './folder/file.txt', + ... './folder (1)/file.txt', + ... 
'./folder (10)/file.txt'] + >>> natsorted(a) + ['./folder (1)/file.txt', './folder (10)/file.txt', './folder/file (1).txt', './folder/file.txt'] + >>> natsorted(a, alg=ns.PATH) + ['./folder/file.txt', './folder/file (1).txt', './folder (1)/file.txt', './folder (10)/file.txt'] + +Locale-Aware Sorting (Human Sorting) +------------------------------------ + +.. note:: + Please read :ref:`locale_issues` before using ``ns.LOCALE``, :func:`humansorted`, + or :func:`index_humansorted`. + +You can instruct :mod:`natsort` to use locale-aware sorting with the +``ns.LOCALE`` option. In addition to making this understand non-ASCII +characters, it will also properly interpret non-'.' decimal separators +and also properly order case. It may be more convenient to just use +the :func:`humansorted` function: + +.. code-block:: pycon + + >>> from natsort import humansorted + >>> import locale + >>> locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') + 'en_US.UTF-8' + >>> a = ['Apple', 'corn', 'Corn', 'Banana', 'apple', 'banana'] + >>> natsorted(a, alg=ns.LOCALE) + ['apple', 'Apple', 'banana', 'Banana', 'corn', 'Corn'] + >>> humansorted(a) + ['apple', 'Apple', 'banana', 'Banana', 'corn', 'Corn'] + +You may find that if you do not explicitly set the locale your results may not +be as you expect... I have found that it depends on the system you are on. +If you use `PyICU `_ (see below) then +you should not need to do this. + +.. _case_sort: + +Controlling Case When Sorting +----------------------------- + +For non-numbers, by default :mod:`natsort` used ordinal sorting (i.e. +it sorts by the character's value in the ASCII table). For example: + +.. code-block:: pycon + + >>> a = ['Apple', 'corn', 'Corn', 'Banana', 'apple', 'banana'] + >>> natsorted(a) + ['Apple', 'Banana', 'Corn', 'apple', 'banana', 'corn'] + +There are times when you wish to ignore the case when sorting, +you can easily do this with the ``ns.IGNORECASE`` option: + +.. 
code-block:: pycon + + >>> natsorted(a, alg=ns.IGNORECASE) + ['Apple', 'apple', 'Banana', 'banana', 'corn', 'Corn'] + +Note that since Python's sorting is stable, the order of equivalent +elements after lowering the case is the same order they appear in the +original list. + +Upper-case letters appear first in the ASCII table, but many natural +sorting methods place lower-case first. To do this, use +``ns.LOWERCASEFIRST``: + +.. code-block:: pycon + + >>> natsorted(a, alg=ns.LOWERCASEFIRST) + ['apple', 'banana', 'corn', 'Apple', 'Banana', 'Corn'] + +It may be undesirable to have the upper-case letters grouped together +and the lower-case letters grouped together; most would expect all +"a"s to be together regardless of case, and all "b"s, and so on. To +achieve this, use ``ns.GROUPLETTERS``: + +.. code-block:: pycon + + >>> natsorted(a, alg=ns.GROUPLETTERS) + ['Apple', 'apple', 'Banana', 'banana', 'Corn', 'corn'] + +You might combine this with ``ns.LOWERCASEFIRST`` to get what most +would expect to be "natural" sorting: + +.. code-block:: pycon + + >>> natsorted(a, alg=ns.G | ns.LF) + ['apple', 'Apple', 'banana', 'Banana', 'corn', 'Corn'] + +Customizing Float Definition +---------------------------- + +You can make :func:`~natsorted` search for any float that would be +a valid Python float literal, such as 5, 0.4, -4.78, +4.2E-34, etc. +using the ``ns.FLOAT`` key. You can disable the exponential component +of the number with ``ns.NOEXP``. + +.. code-block:: pycon + + >>> a = ['a50', 'a51.', 'a+50.4', 'a5.034e1', 'a+50.300'] + >>> natsorted(a, alg=ns.FLOAT) + ['a50', 'a5.034e1', 'a51.', 'a+50.300', 'a+50.4'] + >>> natsorted(a, alg=ns.FLOAT | ns.SIGNED) + ['a50', 'a+50.300', 'a5.034e1', 'a+50.4', 'a51.'] + >>> natsorted(a, alg=ns.FLOAT | ns.SIGNED | ns.NOEXP) + ['a5.034e1', 'a50', 'a+50.300', 'a+50.4', 'a51.'] + +For convenience, the ``ns.REAL`` option is provided which is a shortcut +for ``ns.FLOAT | ns.SIGNED`` and can be used to sort on real numbers.
+This can be easily accessed with the :func:`~realsorted` convenience +function. Please note that the behavior of the :func:`~realsorted` function +was the default behavior of :func:`~natsorted` for :mod:`natsort` +version < 4.0.0: + +.. code-block:: pycon + + >>> natsorted(a, alg=ns.REAL) + ['a50', 'a+50.300', 'a5.034e1', 'a+50.4', 'a51.'] + >>> from natsort import realsorted + >>> realsorted(a) + ['a50', 'a+50.300', 'a5.034e1', 'a+50.4', 'a51.'] + +.. _custom_sort: + +Using a Custom Sorting Key +-------------------------- + +Like the built-in ``sorted`` function, ``natsorted`` can accept a custom +sort key so that: + +.. code-block:: pycon + + >>> from operator import attrgetter, itemgetter + >>> a = [['a', 'num4'], ['b', 'num8'], ['c', 'num2']] + >>> natsorted(a, key=itemgetter(1)) + [['c', 'num2'], ['a', 'num4'], ['b', 'num8']] + >>> class Foo: + ... def __init__(self, bar): + ... self.bar = bar + ... def __repr__(self): + ... return "Foo('{}')".format(self.bar) + >>> b = [Foo('num3'), Foo('num5'), Foo('num2')] + >>> natsorted(b, key=attrgetter('bar')) + [Foo('num2'), Foo('num3'), Foo('num5')] + +Generating a Natsort Key +------------------------ + +If you need to sort a list in-place, you cannot use :func:`~natsorted`; you +need to pass a key to the :meth:`list.sort` method. The function +:func:`~natsort_keygen` is a convenient way to generate these keys for you: + +.. code-block:: pycon + + >>> from natsort import natsort_keygen + >>> a = ['a50', 'a51.', 'a50.4', 'a5.034e1', 'a50.300'] + >>> natsort_key = natsort_keygen(alg=ns.FLOAT) + >>> a.sort(key=natsort_key) + >>> a + ['a50', 'a50.300', 'a5.034e1', 'a50.4', 'a51.'] + +:func:`~natsort_keygen` has the same API as :func:`~natsorted` (minus the +`reverse` option). + +Natural Sorting with ``cmp`` (Python 2 only) +-------------------------------------------- + +.. note:: + This is a Python2-only feature! The :func:`natcmp` function is not + exposed on Python3. 
Because this documentation is built with + Python3, you will not find :func:`natcmp` in the API. + +If you are using a legacy codebase that requires you to use :func:`cmp` instead +of a key-function, you can use :func:`~natcmp`. + +.. code-block:: pycon + + >>> import sys + >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] + >>> if sys.version_info[0] == 2: + ... from natsort import natcmp + ... sorted(a, cmp=natcmp) + ... else: + ... natsorted(a) # so docstrings don't fail + ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] + +:func:`natcmp` also accepts an ``alg`` argument so you can customize your +sorting experience. + +Sorting Multiple Lists According to a Single List +------------------------------------------------- + +Sometimes you have multiple lists, and you want to sort one of those +lists and reorder the other lists according to how the first was sorted. +To achieve this you could use the :func:`~index_natsorted` in combination +with the convenience function +:func:`~order_by_index`: + +.. code-block:: pycon + + >>> from natsort import index_natsorted, order_by_index + >>> a = ['a2', 'a9', 'a1', 'a4', 'a10'] + >>> b = [4, 5, 6, 7, 8] + >>> c = ['hi', 'lo', 'ah', 'do', 'up'] + >>> index = index_natsorted(a) + >>> order_by_index(a, index) + ['a1', 'a2', 'a4', 'a9', 'a10'] + >>> order_by_index(b, index) + [6, 4, 7, 5, 8] + >>> order_by_index(c, index) + ['ah', 'hi', 'do', 'lo', 'up'] + +Returning Results in Reverse Order +---------------------------------- + +Just like the :func:`sorted` built-in function, you can supply the +``reverse`` option to return the results in reverse order: + +.. code-block:: pycon + + >>> a = ['a2', 'a9', 'a1', 'a4', 'a10'] + >>> natsorted(a, reverse=True) + ['a10', 'a9', 'a4', 'a2', 'a1'] + +Sorting Bytes on Python 3 +------------------------- + +Python 3 is rather strict about comparing strings and bytes, and this +can make it difficult to deal with collections of both. 
Because of the +challenge of guessing which encoding should be used to decode a bytes +array to a string, :mod:`natsort` does *not* try to guess and automatically +convert for you; in fact, the official stance of :mod:`natsort` is to +not support sorting bytes. Instead, some decoding convenience functions +have been provided to you (see :ref:`bytes_help`) that allow you to +provide a codec for decoding bytes through the ``key`` argument that +will allow :mod:`natsort` to convert byte arrays to strings for sorting; +these functions know not to raise an error if the input is not a byte +array, so you can use the key on any arbitrary collection of data. + +.. code-block:: pycon + + >>> from natsort import as_ascii + >>> a = [b'a', 14.0, 'b'] + >>> # On Python 2, natsorted(a) would work as expected. + >>> # On Python 3, natsorted(a) would raise a TypeError (bytes() < str()) + >>> natsorted(a, key=as_ascii) == [14.0, b'a', 'b'] + True + +Additionally, regular expressions cannot be run on byte arrays, making it +so that :mod:`natsort` cannot parse them for numbers. As a result, if you +run :mod:`natsort` on a list of bytes, you will get results that are like +Python's default sorting behavior. Of course, you can use the decoding +functions to solve this: + +.. code-block:: pycon + + >>> from natsort import as_utf8 + >>> a = [b'a56', b'a5', b'a6', b'a40'] + >>> natsorted(a) # doctest: +SKIP + [b'a40', b'a5', b'a56', b'a6'] + >>> natsorted(a, key=as_utf8) == [b'a5', b'a6', b'a40', b'a56'] + True + +If you need a codec different from ASCII or UTF-8, you can use +:func:`decoder` to generate a custom key: + +..
code-block:: pycon + + >>> from natsort import decoder + >>> a = [b'a56', b'a5', b'a6', b'a40'] + >>> natsorted(a, key=decoder('latin1')) == [b'a5', b'a6', b'a40', b'a56'] + True + +Sorting a Pandas DataFrame +-------------------------- + +As of Pandas version 0.16.0, the sorting methods do not accept a ``key`` argument, +so you cannot simply pass :func:`natsort_keygen` to a Pandas DataFrame and sort. +This request has been made to the Pandas devs; see +`issue 3942 `_ if you are interested. +If you need to sort a Pandas DataFrame, please check out +`this answer on StackOverflow `_ +for ways to do this without the ``key`` argument to ``sort``. diff --git a/docs/howitworks.rst b/docs/howitworks.rst new file mode 100644 index 0000000..3ecb406 --- /dev/null +++ b/docs/howitworks.rst @@ -0,0 +1,1113 @@ +.. default-domain:: py +.. currentmodule:: natsort + +.. _howitworks: + +How Does Natsort Work? +====================== + +.. contents:: + :local: + +:mod:`natsort` works by breaking strings into smaller sub-components (numbers +or everything else), and returning these components in a tuple. Sorting +tuples in Python is well-defined, and this fact is used to sort the input +strings properly. But how does one break a string into sub-components? +And what does one do to those components once they are split? Below I +will explain the algorithm that was chosen for the :mod:`natsort` module, +and some of the thinking that went into those design decisions. I will +also mention some of the stumbling blocks I ran into because +`getting sorting right is surprisingly hard`_. + +If you are impatient, you can skip to :ref:`tldr1` for the algorithm +in the simplest case, and :ref:`tldr2` +to see what extra code is needed to handle special cases. + +First, How Does Natural Sorting Work At a High Level? +----------------------------------------------------- + +If I want to compare '2 ft 7 in' to '2 ft 11 in', I might do the following + +.. 
code-block:: pycon + + >>> '2 ft 7 in' < '2 ft 11 in' + False + +We as humans know that the above should be true, but why does Python think it +is false? Here is how it is performing the comparison: + +.. code-block:: none + + '2' <=> '2' ==> equal, so keep going + ' ' <=> ' ' ==> equal, so keep going + 'f' <=> 'f' ==> equal, so keep going + 't' <=> 't' ==> equal, so keep going + ' ' <=> ' ' ==> equal, so keep going + '7' <=> '1' ==> different, use result of '7' < '1' + +'7' evaluates as greater than '1' so the statement is false. When sorting, if +a value is less than another it is placed first, so in our above example +'2 ft 11 in' would end up before '2 ft 7 in', which is not correct. What to do? + +The best way to handle this is to break the string into sub-components +of numbers and non-numbers, and then convert the numeric parts into +:func:`float` or :func:`int` types. This will force Python to +actually understand the context of what it is sorting and then "do the +right thing." Luckily, it handles sorting lists of strings right out-of-the-box, +so the only hard part is actually making this string-to-list transformation +and then Python will handle the rest. + +.. code-block:: none + + '2 ft 7 in' ==> (2, ' ft ', 7, ' in') + '2 ft 11 in' ==> (2, ' ft ', 11, ' in') + +When Python compares the two, it roughly follows the below logic: + +.. code-block:: none + + 2 <=> 2 ==> equal, so keep going + ' ft ' <=> ' ft ' ==> a string is a special type of sequence - evaluate each character individually + || + --> + ' ' <=> ' ' ==> equal, so keep going + 'f' <=> 'f' ==> equal, so keep going + 't' <=> 't' ==> equal, so keep going + ' ' <=> ' ' ==> equal, so keep going + <== Back to parent sequence + 7 <=> 11 ==> different, use the result of 7 < 11 + +Clearly, seven is less than eleven, so our comparison is as we expect, and we +would get the sorting order we wanted. 
+ +At its heart, :mod:`natsort` is simply a tool to break strings into tuples, +turning numbers in strings (e.g. ``'79'``) into *ints* and *floats* as it does this. + +Natsort's Approach +------------------ + +.. contents:: + :local: + +Decomposing Strings Into Sub-Components ++++++++++++++++++++++++++++++++++++++++ + +The first major hurdle to overcome is to decompose the string into sub-components. +Remarkably, this turns out to be the easy part, owing mostly to Python's easy access +to regular expressions. Breaking an arbitrary string based on a pattern is pretty +straightforward. + +.. code-block:: pycon + + >>> import re + >>> re.split(r'(\d+)', '2 ft 11 in') + ['', '2', ' ft ', '11', ' in'] + +Clear (assuming you can read regular expressions) and concise. + +The reason I began developing :mod:`natsort` in the first place was because I +needed to handle the natural sorting of strings containing *real numbers*, not just +unsigned integers as the above example contains. By real numbers, I mean those like +``-45.4920E-23``. :mod:`natsort` can handle just about any number definition; +to that end, here are all the regular expressions used in :mod:`natsort`: + +.. code-block:: pycon + + >>> unsigned_int = r'([0-9]+)' + >>> signed_int = r'([-+]?[0-9]+)' + >>> unsigned_float = r'((?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?)' + >>> signed_float = r'([-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?)' + >>> unsigned_float_no_exponent = r'((?:[0-9]+\.?[0-9]*|\.[0-9]+))' + >>> signed_float_no_exponent = r'([-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+))' + +Note that ``"inf"`` and ``"nan"`` are deliberately omitted from the float definition because you +wouldn't want (for example) ``"banana"`` to be converted into ``['ba', 'nan', 'a']``. +Let's see an example: + +.. code-block:: pycon + + >>> re.split(signed_float, 'The mass of 3 electrons is 2.732815068E-30 kg') + ['The mass of ', '3', ' electrons is ', '2.732815068E-30', ' kg'] + +..
note:: + + It is a bit of a lie to say the above are the complete regular expressions. In the + actual code there is also handling for non-ASCII unicode characters (such as ⑦), + but I will ignore that aspect of :mod:`natsort` in this discussion. + +Now, when the user wants to change the definition of a number, it is as easy as changing +the pattern supplied to the regular expression engine. + +Choosing the right default is hard, though (well, in this case it shouldn't have been +but I was rather thick-headed). +In retrospect, it should have been obvious that since essentially all the code examples +I had/have seen for natural sorting were for *unsigned integers*, I should have made the default +definition of a number an *unsigned integer*. But, in the brash days of my youth I assumed +that since my use case was real numbers, everyone else would be happier sorting by real numbers; +so, I made the default definition of a number a *signed float with exponent*. +`This astonished`_ `a lot`_ `of people`_ +(`and some people aren't very nice when they are astonished`_). +Starting with :mod:`natsort` version 4.0.0 the default number definition was +changed to an *unsigned integer* which satisfies the "least astonishment" principle, and +I have not heard a complaint since. + +Coercing Strings Containing Numbers Into Numbers +++++++++++++++++++++++++++++++++++++++++++++++++ + +There has been some debate on Stack Overflow as to what method is best to +coerce a string to a number if it can be coerced, and leaving it alone otherwise +(see `this one for coercion`_ and `this one for checking`_ for some high traffic questions), +but it mostly boils down to two different solutions, shown here: + +.. code-block:: pycon + + >>> def coerce_try_except(x): + ... try: + ... return int(x) + ... except ValueError: + ... return x + ... + >>> def coerce_regex(x): + ... # Note that precompiling the regex is more performant, + ... # but I do not show that here for clarity's sake. + ... 
return int(x) if re.match(r'[-+]?\d+$', x) else x + ... + +Here are some timing results run on my machine: + +.. code-block:: pycon + + In [0]: numbers = list(map(str, range(100))) # A list of numbers as strings + + In [1]: not_numbers = ['banana' + x for x in numbers] + + In [2]: %timeit [coerce_try_except(x) for x in numbers] + 10000 loops, best of 3: 51.1 µs per loop + + In [3]: %timeit [coerce_try_except(x) for x in not_numbers] + 1000 loops, best of 3: 289 µs per loop + + In [4]: %timeit [coerce_regex(x) for x in not_numbers] + 10000 loops, best of 3: 67.6 µs per loop + + In [5]: %timeit [coerce_regex(x) for x in numbers] + 10000 loops, best of 3: 123 µs per loop + +What can we learn from this? The ``try: except`` method (arguably the most "pythonic" +of the solutions) is best for numeric input, but performs over 5X slower for non-numeric +input. Conversely, the regular expression method, though slower than ``try: except`` for +both input types, is more efficient for non-numeric input than for input that can be +converted to an ``int``. Further, even though the regular expression method is slower +for both input types, it is always at least twice as fast as the worst case for the +``try: except``. + +Why do I care? Shouldn't I just pick a method and not worry about it? Probably. However, +I am very conscious about the performance of :mod:`natsort`, and want it to be a true +drop-in replacement for :func:`sorted` without having to incur a performance penalty. +For the purposes of :mod:`natsort`, there is no clear winner between the two algorithms - +the data being passed to this function will likely be a mix of numeric and non-numeric +string content. Do I use the ``try: except`` method and hope the speed gains on +numbers will offset the non-number performance, or do I use regular expressions and +take the more stable performance? + +It turns out that within the context of :mod:`natsort`, some assumptions can be +made that make a hybrid approach attractive. 
Because all strings are pre-split +into numeric and non-numeric content *before* being passed to this coercion function, +the assumption can be made that *if a string begins with a digit or a sign, it +can be coerced into a number*. + +.. code-block:: pycon + + >>> def coerce_to_int(x): + ... if x[0] in '0123456789+-': + ... try: + ... return int(x) + ... except ValueError: + ... return x + ... else: + ... return x + ... + +So how does this perform compared to the standard coercion methods? + +.. code-block:: pycon + + In [6]: %timeit [coerce_to_int(x) for x in numbers] + 10000 loops, best of 3: 71.6 µs per loop + + In [7]: %timeit [coerce_to_int(x) for x in not_numbers] + 10000 loops, best of 3: 26.4 µs per loop + +The hybrid method eliminates most of the time wasted on numbers checking that it +is in fact a number before passing to :func:`int`, and eliminates the time wasted +in the exception stack for input that is not a number. + +That's as fast as we can get, right? In pure Python, probably. At least, it's +close. But because I am crazy and a glutton for punishment, I decided to see +if I could get any faster writing a C extension. It's called +`fastnumbers`_ and contains a C implementation of the above coercion functions +called :func:`fast_int`. How does it fare? Pretty well. + +.. code-block:: pycon + + In [8]: %timeit [fast_int(x) for x in numbers] + 10000 loops, best of 3: 30.9 µs per loop + + In [9]: %timeit [fast_int(x) for x in not_numbers] + 10000 loops, best of 3: 30 µs per loop + +During development of :mod:`natsort`, I wanted to ensure that using it did not +get in the way of a user's program by introducing a performance penalty to their code. +To that end, I do not feel like my adventures down the rabbit hole of optimization +of coercion functions were a waste; I can confidently look users in the eye and +say I considered every option in ensuring :mod:`natsort` is as efficient as possible.
+This is why if `fastnumbers`_ is installed it will be used for this step, +and otherwise the hybrid method will be used. + +.. note:: + + Modifying the hybrid coercion function for floats is straightforward. + + .. code-block:: pycon + + >>> def coerce_to_float(x): + ... if x[0] in '.0123456789+-' or x.lower().lstrip()[:3] in ('nan', 'inf'): + ... try: + ... return float(x) + ... except ValueError: + ... return x + ... else: + ... return x + ... + +.. _tldr1: + +TL;DR 1 - The Simple "No Special Cases" Algorithm ++++++++++++++++++++++++++++++++++++++++++++++++++ + +At this point, our :mod:`natsort` algorithm is essentially the following: + +.. code-block:: pycon + + >>> import re + >>> def natsort_key(x, as_float=False, signed=False): + ... if as_float: + ... regex = signed_float if signed else unsigned_float + ... else: + ... regex = signed_int if signed else unsigned_int + ... split_input = re.split(regex, x) + ... split_input = filter(None, split_input) # removes null strings + ... coerce = coerce_to_float if as_float else coerce_to_int + ... return tuple(coerce(s) for s in split_input) + ... + +I have written the above for clarity and not performance. +This pretty much matches `most natural sort solutions for python on Stack Overflow`_ +(except the above includes customization of the definition of a number). + +Special Cases Everywhere! +------------------------- + +.. contents:: + :local: + +.. image:: special_cases_everywhere.jpg + +If what I described in :ref:`TL;DR 1 <tldr1>` were +all that :mod:`natsort` needed to +do then there probably wouldn't be much need for a third-party module, right? +Probably. But it turns out that in real-world data there are a lot of +special cases that need to be handled, and in true `80%/20%`_ fashion, the +majority of the code in :mod:`natsort` is devoted to handling special cases +like those described below.
+ +Sorting Filesystem Paths ++++++++++++++++++++++++++ + +`The first major special case I encountered was sorting filesystem paths`_ +(if you go to the link, you will see I didn't handle it well for a year... +this was before I fully realized how much functionality I could really add +to :mod:`natsort`). Let's apply the :func:`natsort_key` from above to some +filesystem paths that you might see being auto-generated from your operating +system: + +.. code-block:: pycon + + >>> paths = ['/p/Folder (10)/file.tar.gz', + ... '/p/Folder/file.tar.gz', + ... '/p/Folder (1)/file (1).tar.gz', + ... '/p/Folder (1)/file.tar.gz'] + >>> sorted(paths, key=natsort_key) + ['/p/Folder (1)/file (1).tar.gz', '/p/Folder (1)/file.tar.gz', '/p/Folder (10)/file.tar.gz', '/p/Folder/file.tar.gz'] + +Well that's not right! What is ``'/p/Folder/file.tar.gz'`` doing at the end? +It has to do with the numerical ASCII code assigned to the space and +``/`` characters in the `ASCII table`_. According to the `ASCII table`_, the +space character (number 32) comes before the ``/`` character (number 47). If +we remove the common prefix in all of the above strings (``'/p/Folder'``), we +can see why this happens: + +.. code-block:: pycon + + >>> ' (1)/file.tar.gz' < '/file.tar.gz' + True + >>> ' ' < '/' + True + +This isn't very convenient... how do we solve it? We can split the path +across the path separators and then sort. A convenient way to do this is +with the :data:`Path.parts <pathlib.PurePath.parts>` property from +:mod:`pathlib`: + +.. code-block:: pycon + + >>> import pathlib + >>> sorted(paths, key=lambda x: tuple(natsort_key(s) for s in pathlib.Path(x).parts)) + ['/p/Folder/file.tar.gz', '/p/Folder (1)/file (1).tar.gz', '/p/Folder (1)/file.tar.gz', '/p/Folder (10)/file.tar.gz'] + +Almost! It seems like there is some funny business going on in the final +filename component as well. We can solve that nicely and quickly with +:data:`Path.suffixes <pathlib.PurePath.suffixes>` and :data:`Path.stem +<pathlib.PurePath.stem>`. + +..
code-block:: pycon + + >>> def decompose_path_into_components(x): + ... path_split = list(pathlib.Path(x).parts) + ... # Remove the final filename component from the path. + ... final_component = pathlib.Path(path_split.pop()) + ... # Split off all the extensions. + ... suffixes = final_component.suffixes + ... stem = final_component.name.replace(''.join(suffixes), '') + ... # Remove the '.' prefix of each extension, and make that + ... # final component a list of the stem and each suffix. + ... final_component = [stem] + [x[1:] for x in suffixes] + ... # Replace the split final filename component. + ... path_split.extend(final_component) + ... return path_split + ... + >>> def natsort_key_with_path_support(x): + ... return tuple(natsort_key(s) for s in decompose_path_into_components(x)) + ... + >>> sorted(paths, key=natsort_key_with_path_support) + ['/p/Folder/file.tar.gz', '/p/Folder (1)/file.tar.gz', '/p/Folder (1)/file (1).tar.gz', '/p/Folder (10)/file.tar.gz'] + +This works because in addition to breaking the input by path separators, the final +filename component is separated from its extensions as well [#f1]_. *Then*, each of these +separated components is sent to the :mod:`natsort` algorithm, so the result is +a tuple of tuples. Once that is done, we can see how comparisons can be done in +the expected manner. + +.. code-block:: pycon + + >>> a = natsort_key_with_path_support('/p/Folder (1)/file (1).tar.gz') + >>> a + (('/',), ('p',), ('Folder (', 1, ')'), ('file (', 1, ')'), ('tar',), ('gz',)) + >>> + >>> b = natsort_key_with_path_support('/p/Folder/file.tar.gz') + >>> b + (('/',), ('p',), ('Folder',), ('file',), ('tar',), ('gz',)) + >>> + >>> a > b + True + +Comparing Different Types on Python 3 ++++++++++++++++++++++++++++++++++++++ + +`The second major special case I encountered was sorting of different types`_. +If you are on Python 2 (i.e. 
legacy Python), this mostly doesn't matter *too* +much since it uses an arbitrary heuristic to allow traditionally un-comparable +types to be compared (such as comparing ``'a'`` to ``1``). However, on Python 3 +(i.e. Python) it simply won't let you perform such nonsense, raising a +:exc:`TypeError` instead. + +You can imagine that a module that breaks strings into tuples of numbers and +strings is walking a dangerous line if it does not have special handling for +comparing numbers and strings. My imagination was not so great at first. +Let's take a look at all the ways this can fail with real-world data. + +.. code-block:: pycon + + >>> def natsort_key_with_poor_real_number_support(x): + ... split_input = re.split(signed_float, x) + ... split_input = filter(None, split_input) # removes null strings + ... return tuple(coerce_to_float(s) for s in split_input) + >>> + >>> sorted([5, '4'], key=natsort_key_with_poor_real_number_support) + Traceback (most recent call last): + ... + TypeError: ... + >>> + >>> sorted(['12 apples', 'apples'], key=natsort_key_with_poor_real_number_support) + Traceback (most recent call last): + ... + TypeError: ... + >>> + >>> sorted(['version5.3.0', 'version5.3rc1'], key=natsort_key_with_poor_real_number_support) + Traceback (most recent call last): + ... + TypeError: ... + +Let's break these down. + +#. The integer ``5`` is sent to ``re.split`` which expects only strings + or bytes, which is a no-no. +#. ``natsort_key_with_poor_real_number_support('12 apples') < natsort_key_with_poor_real_number_support('apples')`` + is the same as ``(12.0, ' apples') < ('apples',)``, and thus a number gets + compared to a string [#f2]_ which also is a no-no. +#. This one scores big on the astonishment scale, especially if one accidentally + uses signed integers or real numbers when they mean to use unsigned integers. 
+ ``natsort_key_with_poor_real_number_support('version5.3.0') < natsort_key_with_poor_real_number_support('version5.3rc1')`` + is the same as ``('version', 5.3, 0.0) < ('version', 5.3, 'rc', 1.0)``, so in the + third element a number gets compared to a string, once again the same + old no-no. (The same would happen with ``'version5-3'`` and ``'version5-a'``, + which would become ``('version', 5, -3)`` and ``('version', 5, '-a')``). + +As you might expect, the solution to the first issue is to wrap the ``re.split`` +call in a ``try: except:`` block and handle the number specially if a +:exc:`TypeError` is raised. The second and third cases *could* be handled +in a "special case" manner, meaning only respond and do something different +if these problems are detected. But a less error-prone method is to ensure +that the data is correct-by-construction, and this can be done by ensuring +that the returned tuples *always* start with a string, and then alternate +in a string-number-string-number-string pattern; this can be achieved by +adding an empty string wherever the pattern is not followed [#f3]_. This ends +up working out pretty nicely because empty strings are always "less" than +any non-empty string, and we typically want numbers to come before strings. + +Let's take a look at how this works out. + +..
code-block:: pycon + + >>> from natsort.utils import sep_inserter + >>> list(sep_inserter(iter(['apples']), '')) + ['apples'] + >>> + >>> list(sep_inserter(iter([12, ' apples']), '')) + ['', 12, ' apples'] + >>> + >>> list(sep_inserter(iter(['version', 5, -3]), '')) + ['version', 5, '', -3] + >>> + >>> from natsort import natsort_keygen, ns + >>> natsort_key_with_good_real_number_support = natsort_keygen(alg=ns.REAL) + >>> + >>> sorted([5, '4'], key=natsort_key_with_good_real_number_support) + ['4', 5] + >>> + >>> sorted(['12 apples', 'apples'], key=natsort_key_with_good_real_number_support) + ['12 apples', 'apples'] + >>> + >>> sorted(['version5.3.0', 'version5.3rc1'], key=natsort_key_with_good_real_number_support) + ['version5.3.0', 'version5.3rc1'] + +How the "good" version works will be given in `TL;DR 2 - Handling Crappy, Real-World Input`_. + +Handling NaN +++++++++++++ + +`A rather unexpected special case I encountered was sorting collections containing NaN`_. +Let's see what happens when you try to sort a plain old list of numbers when there +is a **NaN** floating around in there. + +.. code-block:: pycon + + >>> danger = [7, float('nan'), 22.7, 19, -14, 59.123, 4] + >>> sorted(danger) + [7, nan, -14, 4, 19, 22.7, 59.123] + +Clearly that isn't correct, and for once it isn't my fault! +`It's hard to compare floating point numbers`_. By definition, **NaN** is unorderable +to any other number, and is never equal to any other number, including itself. + +.. code-block:: pycon + + >>> nan = float('nan') + >>> 5 > nan + False + >>> 5 < nan + False + >>> 5 == nan + False + >>> 5 != nan + True + >>> nan == nan + False + >>> nan != nan + True + +The implication of all this for us is that if there is an **NaN** in the +data-set we are trying to sort, the data-set will end up being sorted in +two separate yet individually sorted sequences - the one *before* the **NaN**, +and the one *after*. 
This is because the ``<`` operation that is used +to sort always returns :const:`False` with **NaN**. + +Because :mod:`natsort` aims to sort sequences in a way that does not surprise +the user, keeping this behavior is not acceptable (I don't require my users +to know how **NaN** will behave in a sorting algorithm). The simplest way to +satisfy the "least astonishment" principle is to substitute **NaN** with +some other value. But what value is *least* astonishing? I chose to replace +**NaN** with :math:`-\infty` so that these poorly behaved elements always +end up at the front where the users will most likely be alerted to their presence. + +.. code-block:: pycon + + >>> def fix_nan(x): + ... if x != x: # only true for NaN + ... return float('-inf') + ... else: + ... return x + ... + +Let's check out :ref:`TL;DR 2 <tldr2>` to see how this can be +incorporated into the simple key function from :ref:`TL;DR 1 <tldr1>`. + +.. _tldr2: + +TL;DR 2 - Handling Crappy, Real-World Input ++++++++++++++++++++++++++++++++++++++++++++ + +Let's see how our elegant key function from :ref:`TL;DR 1 <tldr1>` has +become bastardized in order to support handling mixed real-world data +and user customizations. + + >>> def natsort_key(x, as_float=False, signed=False, as_path=False): + ... if as_float: + ... regex = signed_float if signed else unsigned_float + ... else: + ... regex = signed_int if signed else unsigned_int + ... try: + ... if as_path: + ... x = decompose_path_into_components(x) # Decomposes into list of strings + ... # If this raises a TypeError, input is not a string. + ... split_input = re.split(regex, x) + ... except TypeError: + ... try: + ... # Does this need to be applied recursively (list-of-list)? + ... return tuple(map(natsort_key, x)) + ... except TypeError: + ... # Must be a number + ... ret = ('', fix_nan(x)) # Maintain string-number-string pattern + ... return (ret,) if as_path else ret # as_path returns tuple-of-tuples + ... else: + ...
split_input = filter(None, split_input) # removes null strings + ... # Note that the coerce_to_int/coerce_to_float functions + ... # are also modified to use the fix_nan function. + ... if as_float: + ... coerced_input = (coerce_to_float(s) for s in split_input) + ... else: + ... coerced_input = (coerce_to_int(s) for s in split_input) + ... return tuple(sep_inserter(coerced_input, '')) + ... + +And this doesn't even show handling :class:`bytes` type! Notice that we have +to do non-obvious things like modify the return form of numbers when ``as_path`` +is given, just to avoid comparing strings and numbers for the case in which a user provides +input like ``['/home/me', 42]``. + +Let's take it out for a spin! + +.. code-block:: pycon + + >>> danger = [7, float('nan'), 22.7, '19', '-14', '59.123', 4] + >>> sorted(danger, key=lambda x: natsort_key(x, as_float=True, signed=True)) + [nan, '-14', 4, 7, '19', 22.7, '59.123'] + >>> + >>> paths = ['/p/Folder (1)/file.tar.gz', + ... '/p/Folder/file.tar.gz', + ... 123456] + >>> sorted(paths, key=lambda x: natsort_key(x, as_path=True)) + [123456, '/p/Folder/file.tar.gz', '/p/Folder (1)/file.tar.gz'] + +Here Be Dragons: Adding Locale Support +-------------------------------------- + +.. contents:: + :local: + +Probably the most challenging special case I had to handle was getting +:mod:`natsort` to handle sorting the non-numerical parts of input +correctly, and also allowing it to sort the numerical bits in different +locales. This was in no way what I originally set out to do with this +library, so I was `caught a bit off guard when the request was initially made`_. +I discovered the :mod:`locale` library, and assumed that if it's part of Python's +StdLib there can't be too many dragons, right? + +.. 
admonition:: INCOMPLETE LIST OF DRAGONS + + - https://github.com/SethMMorton/natsort/issues/21 + - https://github.com/SethMMorton/natsort/issues/22 + - https://github.com/SethMMorton/natsort/issues/23 + - https://github.com/SethMMorton/natsort/issues/36 + - https://github.com/SethMMorton/natsort/issues/44 + - https://bugs.python.org/issue2481 + - https://bugs.python.org/issue23195 + - https://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help + - https://stackoverflow.com/questions/22203550/sort-dictionary-by-key-using-locale-collation + - https://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm + - https://stackoverflow.com/questions/36431810/sort-numeric-lines-with-thousand-separators + - https://stackoverflow.com/questions/45734562/how-can-i-get-a-reasonable-string-sorting-with-python + +These can be summed up as follows: + +#. :mod:`locale` is a thin wrapper over your operating system's *locale* + library, so if *that* is broken (like it is on BSD and OSX) then + :mod:`locale` is broken in Python. +#. Because of a bug in legacy Python (i.e. Python 2), there is no uniform way to use + the :mod:`locale` sorting functionality between legacy Python and Python 3. +#. People have differing opinions of how capitalization should affect word order. +#. There is no built-in way to handle locale-dependent thousands separators + and decimal points *robustly*. +#. Proper handling of Unicode is complicated. +#. Proper handling of :mod:`locale` is complicated. + +Easily over half of the code in :mod:`natsort` is in some way dealing with some +aspect of :mod:`locale` or basic case handling. It would have been +impossible to get right without a `really good`_ `testing strategy`_. + +Don't expect any more TL;DR's... if you want to see how all this is fully +incorporated into the :mod:`natsort` algorithm then please take a look +`at the code`_.
However, I will hint at how specific steps are taken in +each section. + +Let's see how we can handle some of the dragons, one-by-one. + +Basic Case Control Support +++++++++++++++++++++++++++ + +Without even thinking about the mess that is adding :mod:`locale` support, +:mod:`natsort` can introduce support for controlling how case is interpreted. + +First, let's take a look at how it is sorted by default (due to +where characters lie on the `ASCII table`_). + +.. code-block:: pycon + + >>> a = ['Apple', 'corn', 'Corn', 'Banana', 'apple', 'banana'] + >>> sorted(a) + ['Apple', 'Banana', 'Corn', 'apple', 'banana', 'corn'] + +All uppercase letters come before lowercase letters in the `ASCII table`_, +so all capitalized words appear first. Not everyone agrees that this +is the correct order. Some believe that the capitalized words should +be last (``['apple', 'banana', 'corn', 'Apple', 'Banana', 'Corn']``). +Some believe that both the lowercase and uppercase versions +should appear together (``['Apple', 'apple', 'Banana', 'banana', 'Corn', 'corn']``). +Some believe that both should be true ☹. Some people don't care at all [#f4]_. + +Solving the first case (I call it *LOWERCASEFIRST*) is actually pretty +easy... just call the :meth:`str.swapcase` method on the input. + +.. code-block:: pycon + + >>> sorted(a, key=lambda x: x.swapcase()) + ['apple', 'banana', 'corn', 'Apple', 'Banana', 'Corn'] + +The last (I call it *IGNORECASE*) should be super easy, right? +Simply call :meth:`str.lower` on the input. This will work but may +not always give the correct answer on non-latin character sets. It's +a good thing that in Python 3.3 +:meth:`str.casefold` was introduced, which does a better job of removing +all case information from unicode characters in +non-latin alphabets. + +.. code-block:: pycon + + >>> def remove_case(x): + ... try: + ... return x.casefold() + ... except AttributeError: # Legacy Python backwards compatibility + ... return x.lower() + ...
+ >>> sorted(a, key=remove_case) + ['Apple', 'apple', 'Banana', 'banana', 'corn', 'Corn'] + +The middle case (I call it *GROUPLETTERS*) is less straightforward. +The most efficient way to handle this is to duplicate each character +with its lowercase version and then the original character. + +.. code-block:: pycon + + >>> import itertools + >>> def groupletters(x): + ... return ''.join(itertools.chain.from_iterable((remove_case(y), y) for y in x)) + ... + >>> groupletters('Apple') + 'aAppppllee' + >>> groupletters('apple') + 'aappppllee' + >>> sorted(a, key=groupletters) + ['Apple', 'apple', 'Banana', 'banana', 'Corn', 'corn'] + +The effect of this is that both ``'Apple'`` and ``'apple'`` are +placed adjacent to each other because their transformations both begin +with ``'a'``, and then the second character can be used to order them +appropriately with respect to each other. + +There's a problem with this, though. Within the context of :mod:`natsort` +we are trying to correctly sort numbers and those should be left alone. + +.. code-block:: pycon + + >>> a = ['Apple5', 'apple', 'Apple4E10', 'Banana'] + >>> sorted(a, key=lambda x: natsort_key(x, as_float=True)) + ['Apple5', 'Apple4E10', 'Banana', 'apple'] + >>> sorted(a, key=lambda x: natsort_key(groupletters(x), as_float=True)) + ['Apple4E10', 'Apple5', 'apple', 'Banana'] + >>> groupletters('Apple4E10') + 'aAppppllee44eE1100' + +We messed up the numbers! Looks like :func:`groupletters` needs to be applied +*after* the strings are broken into their components. I'm not going to show +how this is done here, but basically it requires applying the function in +the ``else:`` block of :func:`coerce_to_int`/:func:`coerce_to_float`. + +.. 
code-block:: pycon + + >>> better_groupletters = natsort_keygen(alg=ns.GROUPLETTERS | ns.REAL) + >>> better_groupletters('Apple4E10') + ('aAppppllee', 40000000000.0) + >>> sorted(a, key=better_groupletters) + ['Apple5', 'Apple4E10', 'apple', 'Banana'] + +Of course, applying both *LOWERCASEFIRST* and *GROUPLETTERS* is just +a matter of turning on both functions. + +Basic Unicode Support ++++++++++++++++++++++ + +Unicode is hard and complicated. Here's an example. + +.. code-block:: pycon + + >>> b = [b'\x66', b'\x65', b'\xc3\xa9', b'\x65\xcc\x81', b'\x61', b'\x7a'] + >>> a = [x.decode('utf8') for x in b] + >>> a # doctest: +SKIP + ['f', 'e', 'é', 'é', 'a', 'z'] + >>> sorted(a) # doctest: +SKIP + ['a', 'e', 'é', 'f', 'z', 'é'] + + +There are more than one way to represent the character 'é' in Unicode. +In fact, many characters have multiple representations. This is a challenge +because comparing the two representations would return ``False`` even though +they *look* the same. + +.. code-block:: pycon + + >>> a[2] == a[3] + False + +Alas, since characters are compared based on the numerical value of their +representation, sorting Unicode often gives unexpected results (like seeing +'é' come both *before* and *after* 'z'). + +The original approach that :mod:`natsort` took with respect to non-ASCII +Unicode characters was to say "just use +the :mod:`locale` or :mod:`PyICU` library" and then cross it's fingers +and hope those libraries take care of it. As you will find in the following +sections, that comes with its own baggage, and turned out to not always work anyway +(see https://stackoverflow.com/q/45734562/1399279). A more robust approach is to +handle the Unicode out-of-the-box without invoking a heavy-handed library +like :mod:`locale` or :mod:`PyICU`. To do this, we must use *normalization*. + +To fully understand Unicode normalization, `check out some official Unicode documentation`_. +Just kidding... that's too much text. 
The following StackOverflow answers do +a good job at explaining Unicode normalization in simple terms: +https://stackoverflow.com/a/7934397/1399279 and +https://stackoverflow.com/a/7931547/1399279. Put simply, normalization +ensures that Unicode characters with multiple representations are in +some canonical and consistent representation so that (for example) comparisons +of the characters can be performed in a sane way. The following discussion +assumes you at least read the StackOverflow answers. + +Looking back at our 'é' example, we can see that the two versions were +constructed with the byte strings ``b'\xc3\xa9'`` and ``b'\x65\xcc\x81'``. +The former representation is actually +`LATIN SMALL LETTER E WITH ACUTE `_ +and is a single character in the Unicode standard. This is known as the +*compressed form* and corresponds to the 'NFC' normalization scheme. +The latter representation is actually the letter 'e' followed by +`COMBINING ACUTE ACCENT `_ +and so is two characters in the Unicode standard. This is known as the +*decompressed form* and corresponds to the 'NFD' normalization scheme. +Since the first character in the decompressed form is actually the letter 'e', +when compared to other ASCII characters it fits where you might expect. +Unfortunately, all Unicode compressed form characters come after the +ASCII characters and so they always will be placed after 'z' when sorting. + +It seems that most Unicode data is stored and shared in the compressed form +which makes it challenging to sort. This can be solved by normalizing all +incoming Unicode data to the decompressed form ('NFD') and *then* sorting. + +.. code-block:: pycon + + >>> import unicodedata + >>> c = [unicodedata.normalize('NFD', x) for x in a] + >>> c # doctest: +SKIP + ['f', 'e', 'é', 'é', 'a', 'z'] + >>> sorted(c) # doctest: +SKIP + ['a', 'e', 'é', 'é', 'f', 'z'] + +Huzzah! Sane sorting without having to resort to :mod:`locale`! 
+ +Using Locale to Compare Strings ++++++++++++++++++++++++++++++++ + +The :mod:`locale` module is actually pretty cool, and provides lowly +spare-time programmers like myself a way to handle the daunting task +of proper locale-dependent support of their libraries and utilities. +Having said that, it can be a bit of a bear to get right, +`although they do point out in the documentation that it will be painful to use`_. +Aside from the caveats spelled out in that link, it turns out that just +comparing strings with :mod:`locale` in a cross-platform and +cross-python-version manner is not as straightforward as one might hope. + +First, how to use :mod:`locale` to compare strings? It's actually +pretty straightforward. Simply run the input through the :mod:`locale` +transformation function :func:`locale.strxfrm`. + +.. code-block:: pycon + + >>> import locale, sys + >>> locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') + 'en_US.UTF-8' + >>> a = ['a', 'b', 'ä'] + >>> sorted(a) + ['a', 'b', 'ä'] + >>> # The below fails on OSX, so don't run doctest on darwin. + >>> is_osx = sys.platform == 'darwin' + >>> sorted(a, key=locale.strxfrm) if not is_osx else ['a', 'ä', 'b'] + ['a', 'ä', 'b'] + >>> + >>> a = ['apple', 'Banana', 'banana', 'Apple'] + >>> sorted(a, key=locale.strxfrm) if not is_osx else ['apple', 'Apple', 'banana', 'Banana'] + ['apple', 'Apple', 'banana', 'Banana'] + +It turns out that locale-aware sorting groups numbers in the same +way as turning on *GROUPLETTERS* and *LOWERCASEFIRST*. +The trick is that you have to apply :func:`locale.strxfrm` only to non-numeric +characters; otherwise, numbers won't be parsed properly. Therefore, it must +be applied as part of the :func:`coerce_to_int`/:func:`coerce_to_float` +functions in a manner similar to :func:`groupletters`. + +As you might have guessed, there is a small problem. 
+It turns out that there is a bug in the legacy Python implementation of +:func:`locale.strxfrm` that causes it to outright fail for :func:`unicode` +input (https://bugs.python.org/issue2481). :func:`locale.strcoll` works, +but is intended for use with ``cmp``, which does not exist in current Python +implementations. Luckily, the :func:`functools.cmp_to_key` function +makes :func:`locale.strcoll` behave like :func:`locale.strxfrm`. + +Handling Broken Locale On OSX +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +But what if the underlying *locale* implementation that :mod:`locale` +relies upon is simply broken? It turns out that the *locale* library on +OSX (and other BSD systems) is broken (and for some reason has never been +fixed?), and so :mod:`locale` does not work as expected. + +How do I define doesn't work as expected? + +.. code-block:: pycon + + >>> a = ['apple', 'Banana', 'banana', 'Apple'] + >>> sorted(a) + ['Apple', 'Banana', 'apple', 'banana'] + >>> + >>> sorted(a, key=locale.strxfrm) if is_osx else sorted(a) + ['Apple', 'Banana', 'apple', 'banana'] + +IT'S SORTING AS IF :func:`locale.strxfrm` WAS NEVER USED!! (and it's worse +once non-ASCII characters get thrown into the mix.) I'm really not +sure why this is considered OK for the OSX/BSD maintainers to not fix, +but it's more than frustrating for poor developers who have been dragged +into the *locale* game kicking and screaming. **. + +So, how to deal with this situation? There are two ways to do so. + +#. Detect if :mod:`locale` is sorting incorrectly (i.e. ``dumb``) by seeing + if ``'A'`` is sorted before ``'a'`` (incorrect) or not. + + .. code-block:: pycon + + >>> # This is genuinely the name of this function. + >>> # See natsort.compat.locale.py + >>> def dumb_sort(): + ... return locale.strxfrm('A') < locale.strxfrm('a') + ... + + If a ``dumb`` *locale* implementation is found, then automatically + turn on *LOWERCASEFIRST* and *GROUPLETTERS*. +#. Use an alternate library if installed.
`ICU `_ + is a great and powerful library that has a pretty decent Python port + called (you guessed it) `PyICU `_. + If a user has this library installed on their computer, :mod:`natsort` + chooses to use that instead of :mod:`locale`. With a little bit of + planning, one can write a set of wrapper functions that call + the correct library under the hood such that the business logic never + has to know what library is being used (see `natsort.compat.locale.py`_). + +Let me tell you, this little complication really makes a challenge of testing +the code, since one must set up different environments on different operating +systems in order to test all possible code paths. Not to mention that +certain checks *will* fail for certain operating systems and environments +so one must be diligent in either writing the tests not to fail, or ignoring +those tests when on offending environments. + +Handling Locale-Aware Numbers ++++++++++++++++++++++++++++++ + +`Thousands separator support`_ is a problem that I knew would someday be +requested but had decided to push off until a rainy day. One day it finally +rained, and I decided to tackle the problem. + +So what is the problem? Consider the number ``1,234,567`` (assuming the +``','`` is the thousands separator). Try to run that through :func:`int` +and you will get a :exc:`ValueError`. To handle this properly the thousands +separators must be removed. + +.. code-block:: pycon + + >>> float('1,234,567'.replace(',', '')) + 1234567.0 + +What if, in our current locale, the thousands separator is ``'.'`` and +the ``','`` is the decimal separator (like for the German locale *de_DE*)? + +.. code-block:: pycon + + >>> float('1.234.567'.replace('.', '').replace(',', '.')) + 1234567.0 + >>> float('1.234.567,89'.replace('.', '').replace(',', '.')) + 1234567.89 + +This is pretty much what :func:`locale.atoi` and :func:`locale.atof` do +under the hood. So what's the problem? 
Why doesn't :mod:`natsort` just +use this method under its hood? +Well, let's take a look at what would happen if we send some possible +:mod:`natsort` input through our the above function: + +.. code-block:: pycon + + >>> natsort_key('1,234 apples, please.'.replace(',', '')) + ('', 1234, ' apples please.') + >>> natsort_key('Sir, €1.234,50 please.'.replace('.', '').replace(',', '.'), as_float=True) + ('Sir. €', 1234.5, ' please') + +Any character matching the thousands separator was dropped, and anything +matching the decimal separator was changed to ``'.'``! If these characters +were critical to how your data was ordered, this would break :mod:`natsort`. + +The first solution one might consider would be to first decompose the +input into sub-components (like we did for the *GROUPLETTERS* method +above) and then only apply these transformations on the number components. +This is a chicken-and-egg problem, though, because *we cannot appropriately +separate out the numbers because of the thousands separators and +non-'.' decimal separators* (well, at least not without making multiple +passes over the data which I do not consider to be a valid option). + +Regular expressions to the rescue! With regular expressions, we can +remove the thousands separators and change the decimal separator only +when they are actually within a number. Once the input has been +pre-processed with this regular expression, all the infrastructure +shown previously will work. + +Beware, these regular expressions will make your eyes bleed. + +.. code-block:: pycon + + >>> decimal = ',' # Assume German locale, so decimal separator is ',' + >>> # Look-behind assertions cannot accept range modifiers, so instead of i.e. + >>> # (?>> nodecimal = r'(?>> strip_thousands = r''' + ... (?<=[0-9]{{1}}) # At least 1 number + ... (?>> re.sub(strip_thousands, '', 'Sir, €1.234,50 please.', flags=re.X) + 'Sir, €1234,50 please.' 
+ >>> + >>> # The decimal point must be preceded by a number or followed by + >>> # a number. This option only needs to be performed in the + >>> # case when the decimal separator for the locale is not '.'. + >>> switch_decimal = r'(?<=[0-9]){decimal}|{decimal}(?=[0-9])' + >>> switch_decimal = switch_decimal.format(decimal=decimal) + >>> re.sub(switch_decimal, '.', 'Sir, €1234,50 please.', flags=re.X) + 'Sir, €1234.50 please.' + >>> + >>> natsort_key('Sir, €1234.50 please.', as_float=True) + ('Sir, €', 1234.5, ' please.') + +Final Thoughts +-------------- + +My hope is that users of :mod:`natsort` never have to think about or worry +about all the bookkeeping or any of the details described above, and that using +:mod:`natsort` seems to magically "just work". For those of you who +took the time to read this engineering description, I hope it has enlightened +you to some of the issues that can be encountered when code is released +into the wild and has to accept "real-world data", or to what happens +to developers who naïvely make bold assumptions that are counter to +what the rest of the world assumes. + +.. rubric:: Footnotes + +.. [#f1] + To anyone looking through the actual code, you will note that I don't + actually use :mod:`pathlib` to split the paths... I wrote my own version + to avoid adding an external dependency of :mod:`pathlib` on Python < 3.4. +.. [#f2] + *"But if you hadn't removed the leading empty string from re.split this + wouldn't have happened!!"* I can hear you saying. Well, that's true. I don't + have a *great* reason for having done that except that in an earlier + non-optimal incarnation of the algorithm I needed it, and it kind of + stuck, and it made other parts of the code easier if the assumption that + there were no empty strings was valid. +.. [#f3] + I'm not going to show how this is implemented in this document, + but if you are interested you can look at the code to + :func:`sep_inserter` in `util.py`_. +..
[#f4] + Handling each of these is straightforward, but coupled with the rapidly + fracturing execution paths presented in :ref:`TL;DR 2 ` one can imagine + this will get out of hand quickly. If you take a look at `natsort.py`_ and + `util.py`_ you can observe that to avoid this I take a more functional approach + to constructing the :mod:`natsort` algorithm as opposed to the procedural approach + illustrated in :ref:`TL;DR 1 ` and :ref:`TL;DR 2 `. + +.. _ASCII table: https://www.asciitable.com/ +.. _getting sorting right is surprisingly hard: http://www.compciv.org/guides/python/fundamentals/sorting-collections-with-sorted/ +.. _This astonished: https://github.com/SethMMorton/natsort/issues/19 +.. _a lot: https://stackoverflow.com/questions/29548742/python-natsort-sort-strings-recursively +.. _of people: https://stackoverflow.com/questions/24045348/sort-set-of-numbers-in-the-form-xx-yy-in-python +.. _and some people aren't very nice when they are astonished: + https://github.com/xolox/python-naturalsort/blob/ed3e6b6ffaca3bdea3b76e08acbb8bd2a5fee463/README.rst#why-another-natsort-module +.. _fastnumbers: https://github.com/SethMMorton/fastnumbers +.. _as part of my testing: https://github.com/SethMMorton/natsort/blob/master/test_natsort/slow_splitters.py +.. _this one for coercion: https://stackoverflow.com/questions/736043/checking-if-a-string-can-be-converted-to-float-in-python +.. _this one for checking: https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-float +.. _most natural sort solutions for python on Stack Overflow: https://stackoverflow.com/q/4836710/1399279 +.. _80%/20%: https://en.wikipedia.org/wiki/Pareto_principle +.. _The first major special case I encountered was sorting filesystem paths: https://github.com/SethMMorton/natsort/issues/3 +.. _The second major special case I encountered was sorting of different types: https://github.com/SethMMorton/natsort/issues/7 +..
_A rather unexpected special case I encountered was sorting collections containing NaN: + https://github.com/SethMMorton/natsort/issues/27 +.. _It's hard to compare floating point numbers: http://www.drdobbs.com/cpp/its-hard-to-compare-floating-point-numbe/240149806 +.. _caught a bit off guard when the request was initially made: https://github.com/SethMMorton/natsort/issues/14 +.. _at the code: https://github.com/SethMMorton/natsort/tree/master/natsort +.. _natsort.py: https://github.com/SethMMorton/natsort/blob/master/natsort/natsort.py +.. _util.py: https://github.com/SethMMorton/natsort/blob/master/natsort/util.py +.. _although they do point out in the documentation that it will be painful to use: + https://docs.python.org/3/library/locale.html#background-details-hints-tips-and-caveats +.. _natsort.compat.locale.py: https://github.com/SethMMorton/natsort/blob/master/natsort/compat/locale.py +.. _Thousands separator support: https://github.com/SethMMorton/natsort/issues/36 +.. _really good: https://hypothesis.readthedocs.io/en/latest/ +.. _testing strategy: https://docs.pytest.org/en/latest/ +.. _check out some official Unicode documentation: https://unicode.org/reports/tr15/ diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..4c03f3b --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,28 @@ +.. natsort documentation master file, created by + sphinx-quickstart on Thu Jul 17 21:01:29 2014. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +natsort: Simple yet flexible natural sorting in Python. +======================================================= + +Contents: + +.. 
toctree:: + :maxdepth: 2 + :numbered: + + intro.rst + howitworks.rst + examples.rst + api.rst + locale_issues.rst + shell.rst + changelog.rst + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/intro.rst b/docs/intro.rst new file mode 100644 index 0000000..9afbd21 --- /dev/null +++ b/docs/intro.rst @@ -0,0 +1,469 @@ +.. default-domain:: py +.. module:: natsort + +The :mod:`natsort` module +========================= + +Simple yet flexible natural sorting in Python. + + - Source Code: https://github.com/SethMMorton/natsort + - Downloads: https://pypi.org/project/natsort/ + - Documentation: https://natsort.readthedocs.io/ + - Optional Dependencies: + + - `fastnumbers `_ >= 2.0.0 + - `PyICU `_ >= 1.0.0 + +**NOTE**: Please see the `Deprecation Schedule`_ section for changes in +:mod:`natsort` version 6.0.0 and in the upcoming version 7.0.0. + +:mod:`natsort` is a general utility for sorting lists *naturally*; the definition +of "naturally" is not well-defined, but the most common definition is that numbers +contained within the string should be sorted as numbers and not as you would +other characters. If you need to present sorted output to a user, you probably +want to sort it naturally. + +:mod:`natsort` was initially created for sorting scientific output filenames that +contained signed floating point numbers in the names. There was a lack of +algorithms out there that could perform a natural sort on `floats` but +plenty for `ints`; check out +`this StackOverflow question `_ +and its answers and links therein, +`this ActiveState forum `_, +and of course `this great article on natural sorting `_ +from CodingHorror.com for examples of what I mean. +:mod:`natsort` was created to fill in this gap, but has since expanded to handle +just about any definition of a number, as well as other sorting customizations. 
+ +Quick Description +----------------- + +When you try to sort a list of strings that contain numbers, the normal python +sort algorithm sorts lexicographically, so you might not get the results that you +expect: + +.. code-block:: pycon + + >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] + >>> sorted(a) + ['1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '2 ft 7 in', '7 ft 6 in'] + +Notice that it has the order ('1', '10', '2') - this is because the list is +being sorted in lexicographical order, which sorts numbers like you would +letters (e.g. 'b', 'ba', 'c'). + +:mod:`natsort` provides a function :func:`~natsorted` that helps sort lists +"naturally" ("naturally" is rather ill-defined, but in general it means +sorting based on meaning and not computer code point). +Using :func:`~natsorted` is simple: + +.. code-block:: pycon + + >>> from natsort import natsorted + >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] + >>> natsorted(a) + ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] + +:func:`~natsorted` identifies numbers anywhere in a string and sorts them +naturally. Below are some other things you can do with :mod:`natsort` +(please see the :ref:`examples` for a quick start guide, or the :ref:`api` +for more details). + +.. note:: + + :func:`~natsorted` is designed to be a drop-in replacement for the built-in + :func:`sorted` function. Like :func:`sorted`, :func:`~natsorted` + `does not sort in-place`. To sort a list and assign the output to the + same variable, you must explicitly assign the output to a variable: + + ..
code-block:: pycon + + >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] + >>> natsorted(a) + ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] + >>> print(a) # 'a' was not sorted; "natsorted" simply returned a sorted list + ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] + >>> a = natsorted(a) # Now 'a' will be sorted because the sorted list was assigned to 'a' + >>> print(a) + ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] + + Please see `Generating a Reusable Sorting Key and Sorting In-Place`_ for + an alternate way to sort in-place naturally. + +Examples +-------- + +Sorting Versions +++++++++++++++++ + +:mod:`natsort` does not (and never has) actually *comprehend* version numbers. +It just so happens that the most common versioning schemes are designed to +work with standard natural sorting techniques; these schemes include +``MAJOR.MINOR``, ``MAJOR.MINOR.PATCH``, ``YEAR.MONTH.DAY``. If your data +conforms to a scheme like this, then it will work out-of-the-box with +``natsorted`` (as of ``natsort`` version >= 4.0.0): + +.. code-block:: pycon + + >>> a = ['version-1.9', 'version-2.0', 'version-1.11', 'version-1.10'] + >>> natsorted(a) + ['version-1.9', 'version-1.10', 'version-1.11', 'version-2.0'] + +If you need to sort versions that use a more complicated scheme, please see +:ref:`rc_sorting` for examples. + +Sorting by Real Numbers (i.e. Signed Floats) +++++++++++++++++++++++++++++++++++++++++++++ + +This is useful in scientific data analysis and was +the default behavior of :func:`~natsorted` for :mod:`natsort` +version < 4.0.0. Use the :func:`~realsorted` function: + +..
code-block:: pycon + + >>> from natsort import realsorted, ns + >>> # Note that when interpreting as signed floats, the below numbers are + >>> # +5.10, -3.00, +5.30, +2.00 + >>> a = ['position5.10.data', 'position-3.data', 'position5.3.data', 'position2.data'] + >>> natsorted(a) + ['position2.data', 'position5.3.data', 'position5.10.data', 'position-3.data'] + >>> natsorted(a, alg=ns.REAL) + ['position-3.data', 'position2.data', 'position5.10.data', 'position5.3.data'] + >>> realsorted(a) # shortcut for natsorted with alg=ns.REAL + ['position-3.data', 'position2.data', 'position5.10.data', 'position5.3.data'] + +Locale-Aware Sorting (or "Human Sorting") ++++++++++++++++++++++++++++++++++++++++++ + +This is where the non-numeric characters are ordered based on their meaning, +not on their ordinal value, and a locale-dependent thousands separator and decimal +separator is accounted for in the number. +This can be achieved with the :func:`~humansorted` function: + +.. code-block:: pycon + + >>> a = ['Apple', 'apple15', 'Banana', 'apple14,689', 'banana'] + >>> natsorted(a) + ['Apple', 'Banana', 'apple14,689', 'apple15', 'banana'] + >>> import locale + >>> locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') + 'en_US.UTF-8' + >>> natsorted(a, alg=ns.LOCALE) + ['apple15', 'apple14,689', 'Apple', 'banana', 'Banana'] + >>> from natsort import humansorted + >>> humansorted(a) + ['apple15', 'apple14,689', 'Apple', 'banana', 'Banana'] + +You may find you need to explicitly set the locale to get this to work +(as shown in the example). +Please see :ref:`locale_issues` and the Installation section +below before using the :func:`~humansorted` function. + +Further Customizing Natsort ++++++++++++++++++++++++++++ + +If you need to combine multiple algorithm modifiers (such as ``ns.REAL``, +``ns.LOCALE``, and ``ns.IGNORECASE``), you can combine the options using the +bitwise OR operator (``|``). For example, + +.. 
code-block:: pycon + + >>> a = ['Apple', 'apple15', 'Banana', 'apple14,689', 'banana'] + >>> natsorted(a, alg=ns.REAL | ns.LOCALE | ns.IGNORECASE) + ['Apple', 'apple15', 'apple14,689', 'Banana', 'banana'] + >>> # The ns enum provides long and short forms for each option. + >>> ns.LOCALE == ns.L + True + >>> # You can also customize the convenience functions. + >>> natsorted(a, alg=ns.REAL | ns.LOCALE | ns.IGNORECASE) == realsorted(a, alg=ns.L | ns.IC) + True + >>> natsorted(a, alg=ns.REAL | ns.LOCALE | ns.IGNORECASE) == humansorted(a, alg=ns.R | ns.IC) + True + +All of the available customizations can be found in the documentation for +the :class:`~natsort.ns` enum. + +You can also add your own custom transformation functions with the ``key`` argument. +These can be used with ``alg`` if you wish: + +.. code-block:: pycon + + >>> a = ['apple2.50', '2.3apple'] + >>> natsorted(a, key=lambda x: x.replace('apple', ''), alg=ns.REAL) + ['2.3apple', 'apple2.50'] + +Sorting Mixed Types ++++++++++++++++++++ + +You can mix and match ``int``, ``float``, and ``str`` (or ``unicode``) types +when you sort: + +.. code-block:: pycon + + >>> a = ['4.5', 6, 2.0, '5', 'a'] + >>> natsorted(a) + [2.0, '4.5', '5', 6, 'a'] + >>> # On Python 2, sorted(a) would return [2.0, 6, '4.5', '5', 'a'] + >>> # On Python 3, sorted(a) would raise an "unorderable types" TypeError + +Handling Bytes on Python 3 +++++++++++++++++++++++++++ + +:mod:`natsort` does not officially support the `bytes` type on Python 3, but +convenience functions are provided that help you decode to `str` first: + +.. code-block:: pycon + + >>> from natsort import as_utf8 + >>> a = [b'a', 14.0, 'b'] + >>> # On Python 2, natsorted(a) would work as expected. + >>> # On Python 3, natsorted(a) would raise a TypeError (bytes() < str()) + >>> natsorted(a, key=as_utf8) == [14.0, b'a', 'b'] + True + >>> a = [b'a56', b'a5', b'a6', b'a40'] + >>> # On Python 2, natsorted(a) would work as expected.
+ >>> # On Python 3, natsorted(a) would return the same results as sorted(a) + >>> natsorted(a, key=as_utf8) == [b'a5', b'a6', b'a40', b'a56'] + True + +Generating a Reusable Sorting Key and Sorting In-Place +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Under the hood, :func:`~natsorted` works by generating a custom sorting +key using :func:`~natsort_keygen` and then passes that to the built-in +:func:`sorted`. You can use the :func:`~natsort_keygen` function yourself to +generate a custom sorting key to sort in-place using the :meth:`list.sort` +method. + +.. code-block:: pycon + + >>> from natsort import natsort_keygen + >>> natsort_key = natsort_keygen() + >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] + >>> natsorted(a) == sorted(a, key=natsort_key) + True + >>> a.sort(key=natsort_key) + >>> a + ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] + +All of the algorithm customizations mentioned in the `Further Customizing Natsort`_ +section can also be applied to :func:`~natsort_keygen` through the *alg* keyword option. + +Other Useful Things ++++++++++++++++++++ + + - recursively descend into lists of lists + - automatic unicode normalization of input data + - controlling the case-sensitivity (see :ref:`case_sort`) + - sorting file paths correctly (see :ref:`path_sort`) + - allow custom sorting keys (see :ref:`custom_sort`) + +FAQ +--- + +How do I debug :func:`~natsorted`? + The best way to debug :func:`~natsorted` is to generate a key using :func:`~natsort_keygen` + with the same options being passed to :func:`~natsorted`. One can take a look at + exactly what is being done with their input using this key - it is highly recommended + to `look at this issue describing how to debug `_ + for *how* to debug, and also to review the :ref:`howitworks` page for *why* + :mod:`natsort` is doing that to your data. 
+ + If you are trying to sort custom classes and running into trouble, please take a look at + https://github.com/SethMMorton/natsort/issues/60. In short, + custom classes are not likely to be sorted correctly if one relies + on the behavior of ``__lt__`` and the other rich comparison operators in their + custom class - it is better to use a ``key`` function with :mod:`natsort`, or + use the :mod:`natsort` key as part of your rich comparison operator definition. + +How *does* :mod:`natsort` work? + If you don't want to read :ref:`howitworks`, here is a quick primer. + + :mod:`natsort` provides a :term:`key function` that can be passed to + :meth:`list.sort` or :func:`sorted` in order to modify the default sorting + behavior. This key is generated on-demand with the key generator + :func:`natsort.natsort_keygen`. :func:`natsort.natsorted` is essentially a + wrapper for the following code: + + .. code-block:: pycon + + >>> from natsort import natsort_keygen + >>> natsort_key = natsort_keygen() + >>> sorted(['1', '10', '2'], key=natsort_key) + ['1', '2', '10'] + + Users can further customize :mod:`natsort` sorting behavior with the ``key`` + and/or ``alg`` options (see details in the `Further Customizing Natsort`_ + section). + + The key generated by :func:`natsort.natsort_keygen` *always* returns a :class:`tuple`. It + does so in the following way (*some details omitted for clarity*): + + 1. Assume the input is a string, and attempt to split it into numbers and + non-numbers using regular expressions. Numbers are then converted into + either :class:`int` or :class:`float`. + 2. If the above fails because the input is not a string, assume the input + is some other sequence (e.g. :class:`list` or :class:`tuple`), and recursively + apply the key to each element of the sequence. + 3. If the above fails because the input is not iterable, assume the input + is an :class:`int` or :class:`float`, and just return the input in a :class:`tuple`. 
+ + Because a :class:`tuple` is always returned, a :exc:`TypeError` should not be common + unless one tries to do something odd like sort an :class:`int` against a :class:`list`. + +:mod:`natsort` gave me results I didn't expect, and it's a terrible library! + Did you try to debug using the above advice? If so, and you still cannot figure out + the error, then please `file an issue `_. + +Shell script +------------ + +:mod:`natsort` comes with a shell script called :mod:`natsort`, or can also be called +from the command line with ``python -m natsort``. + +Requirements +------------ + +:mod:`natsort` requires Python version 2.7 or Python 3.4 or greater. + +Optional Dependencies +--------------------- + +fastnumbers ++++++++++++ + +The most efficient sorting can occur if you install the +`fastnumbers `_ package +(version >=2.0.0); it helps with the string to number conversions. +:mod:`natsort` will still run (efficiently) without the package, but if you need +to squeeze out that extra juice it is recommended you include this as a dependency. +:mod:`natsort` will not require (or check) that +`fastnumbers `_ is installed +at installation. + +PyICU ++++++ + +It is recommended that you install `PyICU `_ +if you wish to sort in a locale-dependent manner, see :ref:`locale_issues` for +an explanation why. + +Installation +------------ + +Use ``pip``! + +.. code-block:: sh + + $ pip install natsort + +If you want to install the `Optional Dependencies`_, you can use the +`"extras" notation `_ +at installation time to install those dependencies as well - use ``fast`` for +`fastnumbers `_ and ``icu`` for +`PyICU `_. + +.. code-block:: sh + + # Install both optional dependencies. + $ pip install natsort[fast,icu] + # Install just fastnumbers + $ pip install natsort[fast] + +How to Run Tests +---------------- + +Please note that :mod:`natsort` is NOT set-up to support ``python setup.py test``. + +The recommended way to run tests is with `tox `_. 
+After installing ``tox``, running tests is as simple as executing the following in the +``natsort`` directory: + +.. code-block:: sh + + $ tox + +``tox`` will create a virtual environment for your tests and install all the +needed testing requirements for you. You can specify a particular python version +with the ``-e`` flag, e.g. ``tox -e py36``. Static analysis is done with ``tox -e flake8``. +You can see all available testing environments with ``tox --listenvs``. + +If you do not wish to use ``tox``, you can install the testing dependencies with the +``dev-requirements.txt`` file and then run the tests manually using +`pytest `_. + +.. code-block:: console + + $ pip install -r dev-requirements.txt + $ python -m pytest + +Note that above I invoked ``python -m pytest`` instead of just ``pytest`` - this is because +`the former puts the CWD on sys.path `_. + +How to Build Documentation +-------------------------- + +If you want to build the documentation for :mod:`natsort`, it is recommended to use ``tox``: + +.. code-block:: console + + $ tox -e docs + +This will place the documentation in ``build/sphinx/html``. If you do not +wish to use ``tox``, you can do the following: + +.. code-block:: console + + $ pip install sphinx sphinx_rtd_theme + $ python setup.py build_sphinx + +Deprecation Schedule +-------------------- + +Dropping Python 2.7 Support ++++++++++++++++++++++++++++ + +:mod:`natsort` version 7.0.0 will drop support for Python 2.7. + +The version 6.X branch will remain as a "long term support" branch where bug fixes +are applied so that users who cannot update from Python 2.7 will not be forced to +use a buggy :mod:`natsort` version. Once version 7.0.0 is released, new features +will not be added to version 6.X, only bug fixes.
+ +Deprecated APIs ++++++++++++++++ + +In :mod:`natsort` version 6.0.0, the following APIs and functions were removed + + - ``number_type`` keyword argument (deprecated since 3.4.0) + - ``signed`` keyword argument (deprecated since 3.4.0) + - ``exp`` keyword argument (deprecated since 3.4.0) + - ``as_path`` keyword argument (deprecated since 3.4.0) + - ``py3_safe`` keyword argument (deprecated since 3.4.0) + - ``ns.TYPESAFE`` (deprecated since version 5.0.0) + - ``ns.DIGIT`` (deprecated since version 5.0.0) + - ``ns.VERSION`` (deprecated since version 5.0.0) + - :func:`~natsort.versorted` (discouraged since version 4.0.0, officially deprecated since version 5.5.0) + - :func:`~natsort.index_versorted` (discouraged since version 4.0.0, officially deprecated since version 5.5.0) + +In general, if you want to determine if you are using deprecated APIs you can run your +code with the following flag + +.. code-block:: console + + $ python -Wdefault::DeprecationWarning my-code.py + +By default :exc:`DeprecationWarnings` are not shown, but this will cause them to be shown. +Alternatively, you can just set the environment variable ``PYTHONWARNINGS`` to +"default::DeprecationWarning" and then run your code. + +Dropped Pipenv for Development +++++++++++++++++++++++++++++++ + +:mod:`natsort` version 6.0.0 no longer uses `Pipenv `_ +to install development dependencies. + +Dropped Python 2.6 and 3.3 Support +++++++++++++++++++++++++++++++++++ + +:mod:`natsort` version 6.0.0 dropped support for Python 2.6 and Python 3.3. diff --git a/docs/locale_issues.rst b/docs/locale_issues.rst new file mode 100644 index 0000000..88cf3b8 --- /dev/null +++ b/docs/locale_issues.rst @@ -0,0 +1,97 @@ +.. default-domain:: py +.. currentmodule:: natsort + +.. 
_locale_issues: + +Possible Issues with :func:`~natsort.humansorted` or ``ns.LOCALE`` +================================================================== + +Being Locale-Aware Means Both Numbers and Non-Numbers +----------------------------------------------------- + +In addition to modifying how characters are sorted, ``ns.LOCALE`` will take into +account locale-dependent thousands separators (and locale-dependent decimal +separators if ``ns.FLOAT`` is enabled). This means that if you are in a +locale that uses commas as the thousands separator, a number like +``123,456`` will be interpreted as ``123456``. If this is not what you want, +you may consider using ``ns.LOCALEALPHA`` which will only enable locale-aware +sorting for non-numbers (similarly, ``ns.LOCALENUM`` enables locale-aware +sorting only for numbers). + +Regenerate Key With :func:`~natsort.natsort_keygen` After Changing Locale +------------------------------------------------------------------------- + +When :func:`~natsort.natsort_keygen` is called it returns a key function that +hard-codes the provided settings. This means that the key returned when +``ns.LOCALE`` is used contains the settings specified by the locale +*loaded at the time the key is generated*. If you change the locale, +you should regenerate the key to account for the new locale. + +Corollary: Do Not Reuse :func:`~natsort.natsort_keygen` After Changing Locale ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +If you change locale, the old function will not work as expected. +The :mod:`locale` library works with a global state. When +:func:`~natsort.natsort_keygen` is called it does the best job that it can to +make the returned function as static as possible and independent of the global +state, but the :func:`locale.strxfrm` function must access this global state to +work; therefore, if you change locale and use ``ns.LOCALE`` then you should +discard the old key. + +..
note:: If you use `PyICU`_ then you may be able to reuse keys after changing + locale. + +The :mod:`locale` Module From the StdLib Has Issues +--------------------------------------------------- + +:mod:`natsort` will use `PyICU`_ for :func:`~natsort.humansorted` or +``ns.LOCALE`` if it is installed. If not, it will fall back on the +:mod:`locale` library from the Python stdlib. If you do not have `PyICU`_ +installed, please keep the following known problems and issues in mind. + +.. note:: Remember, if you have `PyICU`_ installed you shouldn't need to worry + about any of these. + +Explicitly Set the Locale Before Using :func:`~natsort.humansorted` or ``ns.LOCALE`` +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +I have found that unless you explicitly set a locale, the sorted order may not +be what you expect. Setting this is straightforward +(in the below example I use 'en_US.UTF-8', but you should use your +locale): + +.. code-block:: pycon + + >>> import locale + >>> locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') + 'en_US.UTF-8' + +.. _bug_note: + +The :mod:`locale` Module Is Broken on Mac OS X +++++++++++++++++++++++++++++++++++++++++++++++ + +It's not Python's fault, but the OS... the locale library for BSD-based systems +(of which Mac OS X is one) is broken. See the following links: + + - https://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help + - https://bugs.python.org/issue23195 + - https://github.com/SethMMorton/natsort/issues/21 (contains instructions on installing) + - https://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm + - https://github.com/SethMMorton/natsort/issues/34 + +Of course, installing `PyICU`_ fixes this, but if you don't want to or cannot +install this there is some hope. + + 1. As of ``natsort`` version 4.0.0, ``natsort`` is configured + to compensate for a broken ``locale`` library.
When sorting non-numbers + it will handle case as you expect, but it will still not be able to + comprehend non-ASCII characters properly. Additionally, it has + a built-in lookup table of thousands separators that are incorrect + on OS X/BSD (but it is possible that it is not complete... please file an + issue if you see it is not complete) + 2. Use "\*.ISO8859-1" locale (i.e. 'en_US.ISO8859-1') rather than "\*.UTF-8" + locale. I have found that these have fewer issues than "UTF-8", but + your mileage may vary. + +.. _PyICU: https://pypi.org/project/PyICU diff --git a/docs/shell.rst b/docs/shell.rst new file mode 100644 index 0000000..8a17ccc --- /dev/null +++ b/docs/shell.rst @@ -0,0 +1,158 @@ +.. default-domain:: py +.. currentmodule:: natsort + +.. _shell: + +Shell Script +============ + +The ``natsort`` shell script is automatically installed when you install +:mod:`natsort` with pip. + +Below is the usage and some usage examples for the ``natsort`` shell script. + +Usage +----- + +.. code-block:: none + + usage: natsort [-h] [--version] [-p] [-f LOW HIGH] [-F LOW HIGH] [-e EXCLUDE] + [-r] [-t {digit,int,float,version,ver}] [--nosign] [--noexp] + [--locale] + [entries [entries ...]] + + Performs a natural sort on entries given on the command-line. + A natural sort sorts numerically then alphabetically, and will sort + by numbers in the middle of an entry. + + positional arguments: + entries The entries to sort. Taken from stdin if nothing is + given on the command line. + + optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit + -p, --paths Interpret the input as file paths. This is not + strictly necessary to sort all file paths, but in + cases where there are OS-generated file paths like + "Folder/" and "Folder (1)/", this option is needed to + make the paths sorted in the order you expect + ("Folder/" before "Folder (1)/").
+ -f LOW HIGH, --filter LOW HIGH + Used for keeping only the entries that have a number + falling in the given range. + -F LOW HIGH, --reverse-filter LOW HIGH + Used for excluding the entries that have a number + falling in the given range. + -e EXCLUDE, --exclude EXCLUDE + Used to exclude an entry that contains a specific + number. + -r, --reverse Returns in reversed order. + -t {digit,int,float,version,ver,real,f,i,r,d}, + --number-type {digit,int,float,version,ver,real,f,i,r,d}, + --number_type {digit,int,float,version,ver,real,f,i,r,d} + Choose the type of number to search for. "float" will + search for floating-point numbers. "int" will only + search for integers. "digit", "version", and "ver" are + synonyms for "int"."real" is a shortcut for "float" + with --sign. "i" and "d" are synonyms for "int", "f" + is a synonym for "float", and "r" is a synonym for + "real".The default is int. + --nosign Do not consider "+" or "-" as part of a number, i.e. + do not take sign into consideration. This is the + default. + -s, --sign Consider "+" or "-" as part of a number, i.e. take + sign into consideration. The default is unsigned. + --noexp Do not consider an exponential as part of a number, + i.e. 1e4, would be considered as 1, "e", and 4, not as + 10000. This only effects the --number-type=float. + -l, --locale Causes natsort to use locale-aware sorting. You will + get the best results if you install PyICU. + +Description +----------- + +``natsort`` was originally written to aid in computational chemistry +research so that it would be easy to analyze large sets of output files +named after the parameter used: + +.. code-block:: console + + $ ls *.out + mode1000.35.out mode1243.34.out mode744.43.out mode943.54.out + +(Obviously, in reality there would be more files, but you get the idea.) Notice +that the shell sorts in lexicographical order. This is the behavior of programs like +``find`` as well as ``ls``. 
The problem is passing these files to an +analysis program causes them not to appear in numerical order, which can lead +to bad analysis. To remedy this, use ``natsort``: + +.. code-block:: console + + $ natsort *.out + mode744.43.out + mode943.54.out + mode1000.35.out + mode1243.34.out + $ natsort -t r *.out | xargs your_program + +``-t r`` is short for ``--number-type real``. You can also place natsort in +the middle of a pipe: + +.. code-block:: console + + $ find . -name "*.out" | natsort -t r | xargs your_program + +To sort version numbers, use the default ``--number-type``: + +.. code-block:: console + + $ ls * + prog-1.10.zip prog-1.9.zip prog-2.0.zip + $ natsort * + prog-1.9.zip + prog-1.10.zip + prog-2.0.zip + +In general, all ``natsort`` shell script options mirror the :func:`~natsorted` API, +with notable exception of the ``--filter``, ``--reverse-filter``, and ``--exclude`` +options. These three options are used as follows: + +.. code-block:: console + + $ ls *.out + mode1000.35.out mode1243.34.out mode744.43.out mode943.54.out + $ natsort -t r *.out -f 900 1100 # Select only numbers between 900-1100 + mode943.54.out + mode1000.35.out + $ natsort -t r *.out -F 900 1100 # Select only numbers NOT between 900-1100 + mode744.43.out + mode1243.34.out + $ natsort -t r *.out -e 1000.35 # Exclude 1000.35 from search + mode744.43.out + mode943.54.out + mode1243.34.out + +If you are sorting paths with OS-generated filenames, you may require the +``--paths``/``-p`` option: + +.. code-block:: console + + $ find . ! -path . -type f + ./folder/file (1).txt + ./folder/file.txt + ./folder (1)/file.txt + ./folder (10)/file.txt + ./folder (2)/file.txt + $ find . ! -path . -type f | natsort + ./folder (1)/file.txt + ./folder (2)/file.txt + ./folder (10)/file.txt + ./folder/file (1).txt + ./folder/file.txt + $ find . ! -path . 
-type f | natsort -p + ./folder/file.txt + ./folder/file (1).txt + ./folder (1)/file.txt + ./folder (2)/file.txt + ./folder (10)/file.txt diff --git a/docs/source/api.rst b/docs/source/api.rst deleted file mode 100644 index 528a5e7..0000000 --- a/docs/source/api.rst +++ /dev/null @@ -1,26 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -.. _api: - -natsort API -=========== - -.. toctree:: - :maxdepth: 2 - - natsort_keygen.rst - natsort_key.rst - natsorted.rst - versorted.rst - humansorted.rst - realsorted.rst - index_natsorted.rst - index_versorted.rst - index_humansorted.rst - index_realsorted.rst - order_by_index.rst - ns_class.rst - bytes.rst - chain.rst - locale_issues.rst diff --git a/docs/source/bytes.rst b/docs/source/bytes.rst deleted file mode 100644 index c59d4ad..0000000 --- a/docs/source/bytes.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -.. _bytes_help: - -Help With Bytes On Python 3 -=========================== - -The official stance of :mod:`natsort` is to not support `bytes` for -sorting; there is just too much that can go wrong when trying to automate -conversion between `bytes` and `str`. But rather than completely give up -on `bytes`, :mod:`natsort` provides three functions that make it easy to -quickly decode `bytes` to `str` so that sorting is possible. - -.. autofunction:: decoder - -.. autofunction:: as_ascii - -.. autofunction:: as_utf8 - diff --git a/docs/source/chain.rst b/docs/source/chain.rst deleted file mode 100644 index 5f59706..0000000 --- a/docs/source/chain.rst +++ /dev/null @@ -1,16 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -.. _function_help: - -Help With Creating Function Keys -================================ - -If you need to create a complicated *key* argument to (for example) -:func:`natsorted` that is actually multiple functions called one after the other, -the following function can help you easily perform this action. 
It is -used internally to :mod:`natsort`, and has been exposed publically for -the convenience of the user. - -.. autofunction:: chain_functions - diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst deleted file mode 100644 index c9a8906..0000000 --- a/docs/source/changelog.rst +++ /dev/null @@ -1,369 +0,0 @@ -.. _changelog: - -Changelog ---------- - -09-09-2018 v. 5.4.1 -+++++++++++++++++++ - - - Fix error in a newly added test. - - Changed code format and quality checking infrastructure. - -09-06-2018 v. 5.4.0 -+++++++++++++++++++ - - - Re-expose ``natsort_key`` as "public" and remove the - associated ``DepricationWarning``. - - Add better developer documentation. - - Refactor tests. - - Bump allowed ``fastnumbers`` version. - -07-07-2018 v. 5.3.3 -+++++++++++++++++++ - - - Update docs with a FAQ and quick how-it-works. - - Fix a StopIteration error in the testing code. - - Enable Python 3.7 support in Travis-CI. - -05-17-2018 v. 5.3.2 -+++++++++++++++++++ - - - Fix bug that prevented install on old versions of setuptools. - - Revert layout from src/natsort/ back to natsort/ to make user - testing simpler. - -05-14-2018 v. 5.3.1 -+++++++++++++++++++ - - - No bugfixes or features, just infrastructure and installation updates. - - Move to defining dependencies with Pipfile. - - Development layout is now src/natsort/ instead of natsort/. - - Add bumpversion infrastructure. - - Extras can be installed by "[]" notation. - -04-20-2018 v. 5.3.0 -+++++++++++++++++++ - - - Fix bug in assessing ``fastnumbers`` version at import-time. - - Add ability to consider unicode-decimal numbers as numbers. - -02-14-2018 v. 5.2.0 -+++++++++++++++++++ - - - Add ``ns.NUMAFTER`` to cause numbers to be placed after non-numbers. - - Add ``natcmp`` function (Python 2 only). - -11-11-2017 v. 5.1.1 -+++++++++++++++++++ - - - Added additional unicode number support for Python 3.7. - - Added information on how to install and test. - -08-19-2017 v. 
5.1.0 -+++++++++++++++++++ - - - Fixed ``StopIteration`` warning on Python 3.6+. - - All Unicode input is now normalized. - -04-30-2017 v. 5.0.3 -+++++++++++++++++++ - - - Improved development infrastructure. - - Migrated documentation to ReadTheDocs. - -01-02-2017 v. 5.0.2 -+++++++++++++++++++ - - - Added additional unicode number support for Python 3.6. - - Renamed several internal functions and variables to improve clarity. - - Improved documentation examples. - - Added a "how does it work?" section to the documentation. - -06-04-2016 v. 5.0.1 -+++++++++++++++++++ - - - The ``ns`` enum attributes can now be imported from the top-level - namespace. - - Fixed a bug with the ``from natsort import *`` mechanism. - - Fixed bug with using ``natsort`` with ``python -OO``. - -05-08-2016 v. 5.0.0 -+++++++++++++++++++ - - - ``ns.LOCALE``/``humansorted`` now accounts for thousands separators. - - Refactored entire codebase to be more functional (as in use functions as - units). Previously, the code was rather monolithic and difficult to follow. The - goal is that with the code existing in smaller units, contributing will - be easier. - - Deprecated ``ns.TYPESAFE`` option as it is now always on (due to a new - iterator-based algorithm, the typesafe function is now cheap). - - Increased speed of execution (came for free with the new functional approach - because the new factory function paradigm eliminates most ``if`` branches - during execution). - - - For the most cases, the code is 30-40% faster than version 4.0.4. - - If using ``ns.LOCALE`` or ``humansorted``, the code is 1100% faster than - version 4.0.4. - - - Improved clarity of documentaion with regards to locale-aware sorting. - - Added a new ``chain_functions`` function for convenience in creating - a complex user-given ``key`` from several existing functions. - -11-01-2015 v. 4.0.4 -+++++++++++++++++++ - - - Improved coverage of unit tests. - - Unit tests use new and improved hypothesis library. 
- - Fixed compatibility issues with Python 3.5 - -06-25-2015 v. 4.0.3 -+++++++++++++++++++ - - - Fixed bad install on last release (sorry guys!). - -06-24-2015 v. 4.0.2 -+++++++++++++++++++ - - - Added back Python 2.6 and Python 3.2 compatibility. Unit testing is now - performed for these versions. - - Consolidated under-the-hood compatibility functionality. - -06-04-2015 v. 4.0.1 -+++++++++++++++++++ - - - Added support for sorting NaN by internally converting to -Infinity - or +Infinity - -05-17-2015 v. 4.0.0 -+++++++++++++++++++ - - - Made default behavior of 'natsort' search for unsigned ints, - rather than signed floats. This is a backwards-incompatible - change but in 99% of use cases it should not require any - end-user changes. - - Improved handling of locale-aware sorting on systems where the - underlying locale library is broken. - - Greatly improved all unit tests by adding the hypothesis library. - -04-06-2015 v. 3.5.6 -+++++++++++++++++++ - - - Added 'UNGROUPLETTERS' algorithm to get the case-grouping behavior of - an ordinal sort when using 'LOCALE'. - - Added convenience functions 'decoder', 'as_ascii', and 'as_utf8' for - dealing with bytes types. - -04-04-2015 v. 3.5.5 -+++++++++++++++++++ - - - Added 'realsorted' and 'index_realsorted' functions for - forward-compatibility with >= 4.0.0. - - Made explanation of when to use "TYPESAFE" more clear in the docs. - -04-02-2015 v. 3.5.4 -+++++++++++++++++++ - - - Fixed bug where a 'TypeError' was raised if a string containing a leading - number was sorted with alpha-only strings when 'LOCALE' is used. - -03-26-2015 v. 3.5.3 -+++++++++++++++++++ - - - Fixed bug where '--reverse-filter' option in shell script was not - getting checked for correctness. - - Documentation updates to better describe locale bug, and illustrate - upcoming default behavior change. - - Internal improvements, including making test suite more granular. - -01-13-2015 v. 
3.5.2 -+++++++++++++++++++ - - - Enhancement that will convert a 'pathlib.Path' object to a 'str' if - 'ns.PATH' is enabled. - -09-25-2014 v. 3.5.1 -+++++++++++++++++++ - - - Fixed bug that caused list/tuples to fail when using 'ns.LOWECASEFIRST' - or 'ns.IGNORECASE'. - - Refactored modules so that only the public API was in natsort.py and - ns_enum.py. - - Refactored all import statements to be absolute, not relative. - - -09-02-2014 v. 3.5.0 -+++++++++++++++++++ - - - Added the 'alg' argument to the 'natsort' functions. This argument - accepts an enum that is used to indicate the options the user wishes - to use. The 'number_type', 'signed', 'exp', 'as_path', and 'py3_safe' - options are being deprecated and will become (undocumented) - keyword-only options in natsort version 4.0.0. - - The user can now modify how 'natsort' handles the case of non-numeric - characters. - - The user can now instruct 'natsort' to use locale-aware sorting, which - allows 'natsort' to perform true "human sorting". - - - The `humansorted` convenience function has been included to make this - easier. - - - Updated shell script with locale functionality. - -08-12-2014 v. 3.4.1 -+++++++++++++++++++ - - - 'natsort' will now use the 'fastnumbers' module if it is installed. This - gives up to an extra 30% boost in speed over the previous performance - enhancements. - - Made documentation point to more 'natsort' resources, and also added a - new example in the examples section. - -07-19-2014 v. 3.4.0 -+++++++++++++++++++ - - - Fixed a bug that caused user's options to the 'natsort_key' to not be - passed on to recursive calls of 'natsort_key'. - - Added a 'natsort_keygen' function that will generate a wrapped version - of 'natsort_key' that is easier to call. 'natsort_key' is now set to - deprecate at natsort version 4.0.0. - - Added an 'as_path' option to 'natsorted' & co. that will try to treat - input strings as filepaths. 
This will help yield correct results for - OS-generated inputs like - ``['/p/q/o.x', '/p/q (1)/o.x', '/p/q (10)/o.x', '/p/q/o (1).x']``. - - Massive performance enhancements for string input (1.8x-2.0x), at the expense - of reduction in speed for numeric input (~2.0x). - - - This is a good compromise because the most common input will be strings, - not numbers, and sorting numbers still only takes 0.6x the time of sorting - strings. If you are sorting only numbers, you would use 'sorted' anyway. - - - Added the 'order_by_index' function to help in using the output of - 'index_natsorted' and 'index_versorted'. - - Added the 'reverse' option to 'natsorted' & co. to make it's API more - similar to the builtin 'sorted'. - - Added more unit tests. - - Added auxillary test code that helps in profiling and stress-testing. - - Reworked the documentation, moving most of it to PyPI's hosting platform. - - Added support for coveralls.io. - - Entire codebase is now PyFlakes and PEP8 compliant. - -06-28-2014 v. 3.3.0 -+++++++++++++++++++ - - - Added a 'versorted' method for more convenient sorting of versions. - - Updated command-line tool --number_type option with 'version' and 'ver' - to make it more clear how to sort version numbers. - - Moved unit-testing mechanism from being docstring-based to actual unit tests - in actual functions. - - - This has provided the ability determine the coverage of the unit tests (99%). - - This also makes the pydoc documentation a bit more clear. - - - Made docstrings for public functions mirror the README API. - - Connected natsort development to Travis-CI to help ensure quality releases. - -06-20-2014 v. 3.2.1 -+++++++++++++++++++ - - - Re-"Fixed" unorderable types issue on Python 3.x - this workaround - is for when the problem occurs in the middle of the string. - -05-07-2014 v. 
3.2.0 -+++++++++++++++++++ - - - "Fixed" unorderable types issue on Python 3.x with a workaround that - attempts to replicate the Python 2.x behavior by putting all the numbers - (or strings that begin with numbers) first. - - Now explicitly excluding __pycache__ from releases by adding a prune statement - to MANIFEST.in. - -05-05-2014 v. 3.1.2 -+++++++++++++++++++ - - - Added setup.cfg to support universal wheels. - - Added Python 3.0 and Python 3.1 as requiring the argparse module. - -03-01-2014 v. 3.1.1 -+++++++++++++++++++ - - - Added ability to sort lists of lists. - - Cleaned up import statements. - -01-20-2014 v. 3.1.0 -+++++++++++++++++++ - - - Added the ``signed`` and ``exp`` options to allow finer tuning of the sorting - - Entire codebase now works for both Python 2 and Python 3 without needing to run - ``2to3``. - - Updated all doctests. - - Further simplified the ``natsort`` base code by removing unneeded functions. - - Simplified documentation where possible. - - Improved the shell script code - - - Made the documentation less "path"-centric to make it clear it is not just - for sorting file paths. - - Removed the filesystem-based options because these can be achieved better - though a pipeline. - - Added doctests. - - Added new options that correspond to ``signed`` and ``exp``. - - The user can now specify multiple numbers to exclude or multiple ranges - to filter by. - -10-01-2013 v. 3.0.2 -+++++++++++++++++++ - - - Made float, int, and digit searching algorithms all share the same base function. - - Fixed some outdated comments. - - Made the ``__version__`` variable available when importing the module. - -8-15-2013 v. 3.0.1 -++++++++++++++++++ - - - Added support for unicode strings. - - Removed extraneous ``string2int`` function. - - Fixed empty string removal function. - -7-13-2013 v. 3.0.0 -++++++++++++++++++ - - - Added a ``number_type`` argument to the sorting functions to specify how - liberal to be when deciding what a number is. 
- - Reworked the documentation. - -6-25-2013 v. 2.2.0 -++++++++++++++++++ - - - Added ``key`` attribute to ``natsorted`` and ``index_natsorted`` so that - it mimics the functionality of the built-in ``sorted`` - - Added tests to reflect the new functionality, as well as tests demonstrating - how to get similar functionality using ``natsort_key``. - -12-5-2012 v. 2.1.0 -++++++++++++++++++ - - - Reorganized package. - - Now using a platform independent shell script generator (entry_points - from distribute). - - Can now execute natsort from command line with ``python -m natsort`` - as well. - -11-30-2012 v. 2.0.2 -+++++++++++++++++++ - - - Added the use_2to3 option to setup.py. - - Added distribute_setup.py to the distribution. - - Added dependency to the argparse module (for python2.6). - -11-21-2012 v. 2.0.1 -+++++++++++++++++++ - - - Reorganized directory structure. - - Added tests into the natsort.py file iteself. - -11-16-2012, v. 2.0.0 -++++++++++++++++++++ - - - Updated sorting algorithm to support floats (including exponentials) and - basic version number support. - - Added better README documentation. - - Added doctests. diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index 05c7aea..0000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,275 +0,0 @@ -# -*- coding: utf-8 -*- -# -# natsort documentation build configuration file, created by -# sphinx-quickstart on Thu Jul 17 21:01:29 2014. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-# sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', - 'sphinx.ext.mathjax', - 'sphinx.ext.napoleon', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -# source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'natsort' -# noinspection PyShadowingBuiltins -copyright = u'2014, Seth M. Morton' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The full version, including alpha/beta/rc tags. -release = '5.4.1' -# The short X.Y version. -version = '.'.join(release.split('.')[0:2]) - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -# today = '' -# Else, today_fmt is used as the format for a strftime call. -# today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# exclude_patterns = ['solar/*'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -# default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. 
-# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' -highlight_language = 'python' - -# A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -# keep_warnings = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -on_rtd = os.environ.get('READTHEDOCS') == 'True' -if on_rtd: - html_theme = 'default' -else: - import sphinx_rtd_theme - - html_theme = 'sphinx_rtd_theme' - # html_theme = 'solar' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -html_theme_path = ['.'] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -# html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -# html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -# html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -# html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. 
They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -# html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -# html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -# html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -# html_additional_pages = {} - -# If false, no module index is generated. -# html_domain_indices = True - -# If false, no index is generated. -# html_use_index = True - -# If true, the index is split into individual pages for each letter. -# html_split_index = False - -# If true, links to the reST sources are added to the pages. -# html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -# html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -# html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'natsortdoc' - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). 
- # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # 'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - ('index', 'natsort.tex', u'natsort Documentation', - u'Seth M. Morton', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# latex_use_parts = False - -# If true, show page references after internal links. -# latex_show_pagerefs = False - -# If true, show URL addresses after external links. -# latex_show_urls = False - -# Documents to append as an appendix to all manuals. -# latex_appendices = [] - -# If false, no module index is generated. -# latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'natsort', u'natsort Documentation', - [u'Seth M. Morton'], 1) -] - -# If true, show URL addresses after external links. -# man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ('index', 'natsort', u'natsort Documentation', - u'Seth M. Morton', 'natsort', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -# texinfo_appendices = [] - -# If false, no module index is generated. 
-# texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -# texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -# texinfo_no_detailmenu = False - - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} diff --git a/docs/source/examples.rst b/docs/source/examples.rst deleted file mode 100644 index 29d1aea..0000000 --- a/docs/source/examples.rst +++ /dev/null @@ -1,366 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -.. _examples: - -Examples and Recipes -==================== - -If you want more detailed examples than given on this page, please see -https://github.com/SethMMorton/natsort/tree/master/test_natsort. - -.. contents:: - :local: - -Basic Usage ------------ - -In the most basic use case, simply import :func:`~natsorted` and use -it as you would :func:`sorted`: - -.. code-block:: python - - >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] - >>> sorted(a) - ['1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '2 ft 7 in', '7 ft 6 in'] - >>> from natsort import natsorted, ns - >>> natsorted(a) - ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] - -Sort Version Numbers --------------------- - -As of :mod:`natsort` version >= 4.0.0, :func:`~natsorted` will now properly -sort version numbers. The old function :func:`~versorted` exists for -backwards compatibility but new development should use :func:`~natsorted`. - -.. _rc_sorting: - -Sorting with Alpha, Beta, and Release Candidates -++++++++++++++++++++++++++++++++++++++++++++++++ - -By default, if you wish to sort versions with a non-strict versioning -scheme, you may not get the results you expect: - -.. 
code-block:: python - - >>> a = ['1.2', '1.2rc1', '1.2beta2', '1.2beta1', '1.2alpha', '1.2.1', '1.1', '1.3'] - >>> natsorted(a) - ['1.1', '1.2', '1.2.1', '1.2alpha', '1.2beta1', '1.2beta2', '1.2rc1', '1.3'] - -To make the '1.2' pre-releases come before '1.2.1', you need to use the following -recipe: - -.. code-block:: python - - >>> natsorted(a, key=lambda x: x.replace('.', '~')) - ['1.1', '1.2', '1.2alpha', '1.2beta1', '1.2beta2', '1.2rc1', '1.2.1', '1.3'] - -If you also want '1.2' after all the alpha, beta, and rc candidates, you can -modify the above recipe: - -.. code-block:: python - - >>> natsorted(a, key=lambda x: x.replace('.', '~')+'z') - ['1.1', '1.2alpha', '1.2beta1', '1.2beta2', '1.2rc1', '1.2', '1.2.1', '1.3'] - -Please see `this issue `_ to -see why this works. - -.. _path_sort: - -Sort OS-Generated Paths ------------------------ - -In some cases when sorting file paths with OS-Generated names, the default -:mod:`~natsorted` algorithm may not be sufficient. In cases like these, -you may need to use the ``ns.PATH`` option: - -.. code-block:: python - - >>> a = ['./folder/file (1).txt', - ... './folder/file.txt', - ... './folder (1)/file.txt', - ... './folder (10)/file.txt'] - >>> natsorted(a) - ['./folder (1)/file.txt', './folder (10)/file.txt', './folder/file (1).txt', './folder/file.txt'] - >>> natsorted(a, alg=ns.PATH) - ['./folder/file.txt', './folder/file (1).txt', './folder (1)/file.txt', './folder (10)/file.txt'] - -Locale-Aware Sorting (Human Sorting) ------------------------------------- - -.. note:: - Please read :ref:`locale_issues` before using ``ns.LOCALE``, :func:`humansorted`, - or :func:`index_humansorted`. - -You can instruct :mod:`natsort` to use locale-aware sorting with the -``ns.LOCALE`` option. In addition to making this understand non-ASCII -characters, it will also properly interpret non-'.' decimal separators -and also properly order case. It may be more convenient to just use -the :func:`humansorted` function: - -.. 
code-block:: python - - >>> from natsort import humansorted - >>> import locale - >>> locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') - 'en_US.UTF-8' - >>> a = ['Apple', 'corn', 'Corn', 'Banana', 'apple', 'banana'] - >>> natsorted(a, alg=ns.LOCALE) - ['apple', 'Apple', 'banana', 'Banana', 'corn', 'Corn'] - >>> humansorted(a) - ['apple', 'Apple', 'banana', 'Banana', 'corn', 'Corn'] - -You may find that if you do not explicitly set the locale your results may not -be as you expect... I have found that it depends on the system you are on. -If you use `PyICU `_ (see below) then -you should not need to do this. - -.. _case_sort: - -Controlling Case When Sorting ------------------------------ - -For non-numbers, by default :mod:`natsort` used ordinal sorting (i.e. -it sorts by the character's value in the ASCII table). For example: - -.. code-block:: python - - >>> a = ['Apple', 'corn', 'Corn', 'Banana', 'apple', 'banana'] - >>> natsorted(a) - ['Apple', 'Banana', 'Corn', 'apple', 'banana', 'corn'] - -There are times when you wish to ignore the case when sorting, -you can easily do this with the ``ns.IGNORECASE`` option: - -.. code-block:: python - - >>> natsorted(a, alg=ns.IGNORECASE) - ['Apple', 'apple', 'Banana', 'banana', 'corn', 'Corn'] - -Note thats since Python's sorting is stable, the order of equivalent -elements after lowering the case is the same order they appear in the -original list. - -Upper-case letters appear first in the ASCII table, but many natural -sorting methods place lower-case first. To do this, use -``ns.LOWERCASEFIRST``: - -.. code-block:: python - - >>> natsorted(a, alg=ns.LOWERCASEFIRST) - ['apple', 'banana', 'corn', 'Apple', 'Banana', 'Corn'] - -It may be undesirable to have the upper-case letters grouped together -and the lower-case letters grouped together; most would expect all -"a"s to bet together regardless of case, and all "b"s, and so on. To -achieve this, use ``ns.GROUPLETTERS``: - -.. 
code-block:: python - - >>> natsorted(a, alg=ns.GROUPLETTERS) - ['Apple', 'apple', 'Banana', 'banana', 'Corn', 'corn'] - -You might combine this with ``ns.LOWERCASEFIRST`` to get what most -would expect to be "natural" sorting: - -.. code-block:: python - - >>> natsorted(a, alg=ns.G | ns.LF) - ['apple', 'Apple', 'banana', 'Banana', 'corn', 'Corn'] - -Customizing Float Definition ----------------------------- - -You can make :func:`~natsorted` search for any float that would be -a valid Python float literal, such as 5, 0.4, -4.78, +4.2E-34, etc. -using the ``ns.FLOAT`` key. You can disable the exponential component -of the number with ``ns.NOEXP``. - -.. code-block:: python - - >>> a = ['a50', 'a51.', 'a+50.4', 'a5.034e1', 'a+50.300'] - >>> natsorted(a, alg=ns.FLOAT) - ['a50', 'a5.034e1', 'a51.', 'a+50.300', 'a+50.4'] - >>> natsorted(a, alg=ns.FLOAT | ns.SIGNED) - ['a50', 'a+50.300', 'a5.034e1', 'a+50.4', 'a51.'] - >>> natsorted(a, alg=ns.FLOAT | ns.SIGNED | ns.NOEXP) - ['a5.034e1', 'a50', 'a+50.300', 'a+50.4', 'a51.'] - -For convenience, the ``ns.REAL`` option is provided which is a shortcut -for ``ns.FLOAT | ns.SIGNED`` and can be used to sort on real numbers. -This can be easily accessed with the :func:`~realsorted` convenience -function. Please note that the behavior of the :func:`~realsorted` function -was the default behavior of :func:`~natsorted` for :mod:`natsort` -version < 4.0.0: - -.. code-block:: python - - >>> natsorted(a, alg=ns.REAL) - ['a50', 'a+50.300', 'a5.034e1', 'a+50.4', 'a51.'] - >>> from natsort import realsorted - >>> realsorted(a) - ['a50', 'a+50.300', 'a5.034e1', 'a+50.4', 'a51.'] - -.. _custom_sort: - -Using a Custom Sorting Key --------------------------- - -Like the built-in ``sorted`` function, ``natsorted`` can accept a custom -sort key so that: - -.. 
code-block:: python - - >>> from operator import attrgetter, itemgetter - >>> a = [['a', 'num4'], ['b', 'num8'], ['c', 'num2']] - >>> natsorted(a, key=itemgetter(1)) - [['c', 'num2'], ['a', 'num4'], ['b', 'num8']] - >>> class Foo: - ... def __init__(self, bar): - ... self.bar = bar - ... def __repr__(self): - ... return "Foo('{0}')".format(self.bar) - >>> b = [Foo('num3'), Foo('num5'), Foo('num2')] - >>> natsorted(b, key=attrgetter('bar')) - [Foo('num2'), Foo('num3'), Foo('num5')] - -Generating a Natsort Key ------------------------- - -If you need to sort a list in-place, you cannot use :func:`~natsorted`; you -need to pass a key to the :meth:`list.sort` method. The function -:func:`~natsort_keygen` is a convenient way to generate these keys for you: - -.. code-block:: python - - >>> from natsort import natsort_keygen - >>> a = ['a50', 'a51.', 'a50.4', 'a5.034e1', 'a50.300'] - >>> natsort_key = natsort_keygen(alg=ns.FLOAT) - >>> a.sort(key=natsort_key) - >>> a - ['a50', 'a50.300', 'a5.034e1', 'a50.4', 'a51.'] - -:func:`~natsort_keygen` has the same API as :func:`~natsorted` (minus the -`reverse` option). - -Natural Sorting with ``cmp`` (Python 2 only) --------------------------------------------- - -.. note:: - This is a Python2-only feature! The :func:`natcmp` function is not - exposed on Python3. Because this documentation is built with - Python3, you will not find :func:`natcmp` in the API. - -If you are using a legacy codebase that requires you to use :func:`cmp` instead -of a key-function, you can use :func:`~natcmp`. - -.. code-block:: python - - >>> import sys - >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] - >>> if sys.version_info[0] == 2: - ... from natsort import natcmp - ... sorted(a, cmp=natcmp) - ... else: - ... natsorted(a) # so docstrings don't fail - ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] - -:func:`natcmp` also accepts an ``alg`` argument so you can customize your -sorting experience. 
- -Sorting Multiple Lists According to a Single List -------------------------------------------------- - -Sometimes you have multiple lists, and you want to sort one of those -lists and reorder the other lists according to how the first was sorted. -To achieve this you could use the :func:`~index_natsorted` in combination -with the convenience function -:func:`~order_by_index`: - -.. code-block:: python - - >>> from natsort import index_natsorted, order_by_index - >>> a = ['a2', 'a9', 'a1', 'a4', 'a10'] - >>> b = [4, 5, 6, 7, 8] - >>> c = ['hi', 'lo', 'ah', 'do', 'up'] - >>> index = index_natsorted(a) - >>> order_by_index(a, index) - ['a1', 'a2', 'a4', 'a9', 'a10'] - >>> order_by_index(b, index) - [6, 4, 7, 5, 8] - >>> order_by_index(c, index) - ['ah', 'hi', 'do', 'lo', 'up'] - -Returning Results in Reverse Order ----------------------------------- - -Just like the :func:`sorted` built-in function, you can supply the -``reverse`` option to return the results in reverse order: - -.. code-block:: python - - >>> a = ['a2', 'a9', 'a1', 'a4', 'a10'] - >>> natsorted(a, reverse=True) - ['a10', 'a9', 'a4', 'a2', 'a1'] - -Sorting Bytes on Python 3 -------------------------- - -Python 3 is rather strict about comparing strings and bytes, and this -can make it difficult to deal with collections of both. Because of the -challenge of guessing which encoding should be used to decode a bytes -array to a string, :mod:`natsort` does *not* try to guess and automatically -convert for you; in fact, the official stance of :mod:`natsort` is to -not support sorting bytes. Instead, some decoding convenience functions -have been provided to you (see :ref:`bytes_help`) that allow you to -provide a codec for decoding bytes through the ``key`` argument that -will allow :mod:`natsort` to convert byte arrays to strings for sorting; -these functions know not to raise an error if the input is not a byte -array, so you can use the key on any arbitrary collection of data. - -.. 
code-block:: python - - >>> from natsort import as_ascii - >>> a = [b'a', 14.0, 'b'] - >>> # On Python 2, natsorted(a) would work as expected. - >>> # On Python 3, natsorted(a) would raise a TypeError (bytes() < str()) - >>> natsorted(a, key=as_ascii) == [14.0, b'a', 'b'] - True - -Additionally, regular expressions cannot be run on byte arrays, making it -so that :mod:`natsort` cannot parse them for numbers. As a result, if you -run :mod:`natsort` on a list of bytes, you will get results that are like -Python's default sorting behavior. Of course, you can use the decoding -functions to solve this: - -.. code-block:: python - - >>> from natsort import as_utf8 - >>> a = [b'a56', b'a5', b'a6', b'a40'] - >>> natsorted(a) # doctest: +SKIP - [b'a40', b'a5', b'a56', b'a6'] - >>> natsorted(a, key=as_utf8) == [b'a5', b'a6', b'a40', b'a56'] - True - -If you need a codec different from ASCII or UTF-8, you can use -:func:`decoder` to generate a custom key: - -.. code-block:: python - - >>> from natsort import decoder - >>> a = [b'a56', b'a5', b'a6', b'a40'] - >>> natsorted(a, key=decoder('latin1')) == [b'a5', b'a6', b'a40', b'a56'] - True - -Sorting a Pandas DataFrame --------------------------- - -As of Pandas version 0.16.0, the sorting methods do not accept a ``key`` argument, -so you cannot simply pass :func:`natsort_keygen` to a Pandas DataFrame and sort. -This request has been made to the Pandas devs; see -`issue 3942 `_ if you are interested. -If you need to sort a Pandas DataFrame, please check out -`this answer on StackOverflow `_ -for ways to do this without the ``key`` argument to ``sort``. diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst deleted file mode 100644 index 2415a91..0000000 --- a/docs/source/howitworks.rst +++ /dev/null @@ -1,1113 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -.. _howitworks: - -How Does Natsort Work? -====================== - -.. 
contents:: - :local: - -:mod:`natsort` works by breaking strings into smaller sub-components (numbers -or everything else), and returning these components in a tuple. Sorting -tuples in Python is well-defined, and this fact is used to sort the input -strings properly. But how does one break a string into sub-components? -And what does one do to those components once they are split? Below I -will explain the algorithm that was chosen for the :mod:`natsort` module, -and some of the thinking that went into those design decisions. I will -also mention some of the stumbling blocks I ran into because -`getting sorting right is surprisingly hard`_. - -If you are impatient, you can skip to :ref:`tldr1` for the algorithm -in the simplest case, and :ref:`tldr2` -to see what extra code is needed to handle special cases. - -First, How Does Natural Sorting Work At a High Level? ------------------------------------------------------ - -If I want to compare '2 ft 7 in' to '2 ft 11 in', I might do the following - -.. code-block:: python - - >>> '2 ft 7 in' < '2 ft 11 in' - False - -We as humans know that the above should be true, but why does Python think it -is false? Here is how it is performing the comparison:: - - '2' <=> '2' ==> equal, so keep going - ' ' <=> ' ' ==> equal, so keep going - 'f' <=> 'f' ==> equal, so keep going - 't' <=> 't' ==> equal, so keep going - ' ' <=> ' ' ==> equal, so keep going - '7' <=> '1' ==> different, use result of '7' < '1' - -'7' evaluates as greater than '1' so the statement is false. When sorting, if -a value is less than another it is placed first, so in our above example -'2 ft 11 in' would end up before '2 ft 7 in', which is not correct. What to do? - -The best way to handle this is to break the string into sub-components -of numbers and non-numbers, and then convert the numeric parts into -:func:`float` or :func:`int` types. This will force Python to -actually understand the context of what it is sorting and then "do the -right thing." 
Luckily, it handles sorting lists of strings right out-of-the-box, -so the only hard part is actually making this string-to-list transformation -and then Python will handle the rest. - -:: - - '2 ft 7 in' ==> (2, ' ft ', 7, ' in') - '2 ft 11 in' ==> (2, ' ft ', 11, ' in') - -When Python compares the two, it roughly follows the below logic:: - - 2 <=> 2 ==> equal, so keep going - ' ft ' <=> ' ft ' ==> a string is a special type of sequence - evaluate each character individually - || - --> - ' ' <=> ' ' ==> equal, so keep going - 'f' <=> 'f' ==> equal, so keep going - 't' <=> 't' ==> equal, so keep going - ' ' <=> ' ' ==> equal, so keep going - <== Back to parent sequence - 7 <=> 11 ==> different, use the result of 7 < 11 - -Clearly, seven is less than eleven, so our comparison is as we expect, and we -would get the sorting order we wanted. - -At its heart, :mod:`natsort` is simply a tool to break strings into tuples, -turning numbers in strings (i.e. ``'79'``) into *ints* and *floats* as it does this. - -Natsort's Approach ------------------- - -.. contents:: - :local: - -Decomposing Strings Into Sub-Components -+++++++++++++++++++++++++++++++++++++++ - -The first major hurdle to overcome is to decompose the string into sub-components. -Remarkably, this turns out to be the easy part, owing mostly to Python's easy access -to regular expressions. Breaking an arbitrary string based on a pattern is pretty -straightforward. - -.. code-block:: python - - >>> import re - >>> re.split(r'(\d+)', '2 ft 11 in') - ['', '2', ' ft ', '11', ' in'] - -Clear (assuming you can read regular expressions) and concise. - -The reason I began developing :mod:`natsort` in the first place was because I -needed to handle the natural sorting of strings containing *real numbers*, not just -unsigned integers as the above example contains. By real numbers, I mean those like -``-45.4920E-23``. 
:mod:`natsort` can handle just about any number definition; -to that end, here are all the regular expressions used in :mod:`natsort`: - -.. code-block:: python - - >>> unsigned_int = r'([0-9]+)' - >>> signed_int = r'([-+]?[0-9]+)' - >>> unsigned_float = r'((?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?)' - >>> signed_float = r'([-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?)' - >>> unsigned_float_no_exponent = r'((?:[0-9]+\.?[0-9]*|\.[0-9]+))' - >>> signed_float_no_exponent = r'([-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+))' - -Note that ``"inf"`` and ``"nan"`` are deliberately omitted from the float definition because you -wouldn't want (for example) ``"banana"`` to be converted into ``['ba', 'nan', 'a']``. -Let's see an example: - -.. code-block:: python - - >>> re.split(signed_float, 'The mass of 3 electrons is 2.732815068E-30 kg') - ['The mass of ', '3', ' electrons is ', '2.732815068E-30', ' kg'] - -.. note:: - - It is a bit of a lie to say the above are the complete regular expressions. In the - actual code there is also handling for non-ASCII unicode characters (such as ⑦), - but I will ignore that aspect of :mod:`natsort` in this discussion. - -Now, when the user wants to change the definition of a number, it is as easy as changing -the pattern supplied to the regular expression engine. - -Choosing the right default is hard, though (well, in this case it shouldn't have been -but I was rather thick-headed). -In retrospect, it should have been obvious that since essentially all the code examples -I had/have seen for natural sorting were for *unsigned integers*, I should have made the default -definition of a number an *unsigned integer*. But, in the brash days of my youth I assumed -that since my use case was real numbers, everyone else would be happier sorting by real numbers; -so, I made the default definition of a number a *signed float with exponent*. -`This astonished`_ `a lot`_ `of people`_ -(`and some people aren't very nice when they are astonished`_). 
-Starting with :mod:`natsort` version 4.0.0 the default number definition was -changed to an *unsigned integer* which satisfies the "least astonishment" principle, and -I have not heard a complaint since. - -Coercing Strings Containing Numbers Into Numbers -++++++++++++++++++++++++++++++++++++++++++++++++ - -There has been some debate on Stack Overflow as to what method is best to -coerce a string to a number if it can be coerced, and leaving it alone otherwise -(see `this one for coercion`_ and `this one for checking`_ for some high traffic questions), -but it mostly boils down to two different solutions, shown here: - -.. code-block:: python - - >>> def coerce_try_except(x): - ... try: - ... return int(x) - ... except ValueError: - ... return x - ... - >>> def coerce_regex(x): - ... # Note that precompiling the regex is more performant, - ... # but I do not show that here for clarity's sake. - ... return int(x) if re.match(r'[-+]?\d+$', x) else x - ... - -Here are some timing results run on my machine: - -:: - - In [0]: numbers = list(map(str, range(100))) # A list of numbers as strings - - In [1]: not_numbers = ['banana' + x for x in numbers] - - In [2]: %timeit [coerce_try_except(x) for x in numbers] - 10000 loops, best of 3: 51.1 µs per loop - - In [3]: %timeit [coerce_try_except(x) for x in not_numbers] - 1000 loops, best of 3: 289 µs per loop - - In [4]: %timeit [coerce_regex(x) for x in not_numbers] - 10000 loops, best of 3: 67.6 µs per loop - - In [5]: %timeit [coerce_regex(x) for x in numbers] - 10000 loops, best of 3: 123 µs per loop - -What can we learn from this? The ``try: except`` method (arguably the most "pythonic" -of the solutions) is best for numeric input, but performs over 5X slower for non-numeric -input. Conversely, the regular expression method, though slower than ``try: except`` for -both input types, is more efficient for non-numeric input than for input that can be -converted to an ``int``. 
Further, even though the regular expression method is slower -for both input types, it is always at least twice as fast as the worst case for the -``try: except``. - -Why do I care? Shouldn't I just pick a method and not worry about it? Probably. However, -I am very conscious about the performance of :mod:`natsort`, and want it to be a true -drop-in replacement for :func:`sorted` without having to incur a performance penalty. -For the purposes of :mod:`natsort`, there is no clear winner between the two algorithms - -the data being passed to this function will likely be a mix of numeric and non-numeric -string content. Do I use the ``try: except`` method and hope the speed gains on -numbers will offset the non-number performance, or do I use regular expressions and -take the more stable performance? - -It turns out that within the context of :mod:`natsort`, some assumptions can be -made that make a hybrid approach attractive. Because all strings are pre-split -into numeric and non-numeric content *before* being passed to this coercion function, -the assumption can be made that *if a string begins with a digit or a sign, it -can be coerced into a number*. - -.. code-block:: python - - >>> def coerce_to_int(x): - ... if x[0] in '0123456789+-': - ... try: - ... return int(x) - ... except ValueError: - ... return x - ... else: - ... return x - ... - -So how does this perform compared to the standard coercion methods? - -:: - - In [6]: %timeit [coerce_to_int(x) for x in numbers] - 10000 loops, best of 3: 71.6 µs per loop - - In [7]: %timeit [coerce_to_int(x) for x in not_numbers] - 10000 loops, best of 3: 26.4 µs per loop - -The hybrid method eliminates most of the time wasted on numbers checking that it -is in fact a number before passing to :func:`int`, and eliminates the time wasted -in the exception stack for input that is not a number. - -That's as fast as we can get, right? In pure Python, probably. At least, it's -close. 
But because I am crazy and a glutton for punishment, I decided to see -if I could get any faster writing a C extension. It's called -`fastnumbers`_ and contains a C implementation of the above coercion functions -called :func:`fast_int`. How does it fare? Pretty well. - -:: - - In [8]: %timeit [fast_int(x) for x in numbers] - 10000 loops, best of 3: 30.9 µs per loop - - In [9]: %timeit [fast_int(x) for x in not_numbers] - 10000 loops, best of 3: 30 µs per loop - -During development of :mod:`natsort`, I wanted to ensure that using it did not -get in the way of a user's program by introducing a performance penalty to their code. -To that end, I do not feel like my adventures down the rabbit hole of optimization -of coercion functions were a waste; I can confidently look users in the eye and -say I considered every option in ensuring :mod:`natsort` is as efficient as possible. -This is why if `fastnumbers`_ is installed it will be used for this step, -and otherwise the hybrid method will be used. - -.. note:: - - Modifying the hybrid coercion function for floats is straightforward. - - .. code-block:: python - - >>> def coerce_to_float(x): - ... if x[0] in '.0123456789+-' or x.lower().lstrip()[:3] in ('nan', 'inf'): - ... try: - ... return float(x) - ... except ValueError: - ... return x - ... else: - ... return x - ... - -.. _tldr1: - -TL;DR 1 - The Simple "No Special Cases" Algorithm -+++++++++++++++++++++++++++++++++++++++++++++++++ - -At this point, our :mod:`natsort` algorithm is essentially the following: - -.. code-block:: python - - >>> import re - >>> def natsort_key(x, as_float=False, signed=False): - ... if as_float: - ... regex = signed_float if signed else unsigned_float - ... else: - ... regex = signed_int if signed else unsigned_int - ... split_input = re.split(regex, x) - ... split_input = filter(None, split_input) # removes null strings - ... coerce = coerce_to_float if as_float else coerce_to_int - ... 
return tuple(coerce(s) for s in split_input) - ... - -I have written the above for clarity and not performance. -This pretty much matches `most natural sort solutions for python on Stack Overflow`_ -(except the above includes customization of the definition of a number). - -Special Cases Everywhere! -------------------------- - -.. contents:: - :local: - -.. image:: special_cases_everywhere.jpg - -If what I described in :ref:`TL;DR 1 <tldr1>` were -all that :mod:`natsort` needed to -do then there probably wouldn't be much need for a third-party module, right? -Probably. But it turns out that in real-world data there are a lot of -special cases that need to be handled, and in true `80%/20%`_ fashion, the -majority of the code in :mod:`natsort` is devoted to handling special cases -like those described below. - -Sorting Filesystem Paths -++++++++++++++++++++++++ - -`The first major special case I encountered was sorting filesystem paths`_ -(if you go to the link, you will see I didn't handle it well for a year... -this was before I fully realized how much functionality I could really add -to :mod:`natsort`). Let's apply the :func:`natsort_key` from above to some -filesystem paths that you might see being auto-generated from your operating -system: - -.. code-block:: python - - >>> paths = ['/p/Folder (10)/file.tar.gz', - ... '/p/Folder/file.tar.gz', - ... '/p/Folder (1)/file (1).tar.gz', - ... '/p/Folder (1)/file.tar.gz'] - >>> sorted(paths, key=natsort_key) - ['/p/Folder (1)/file (1).tar.gz', '/p/Folder (1)/file.tar.gz', '/p/Folder (10)/file.tar.gz', '/p/Folder/file.tar.gz'] - -Well that's not right! What is ``'/p/Folder/file.tar.gz'`` doing at the end? -It has to do with the numerical ASCII code assigned to the space and -``/`` characters in the `ASCII table`_. According to the `ASCII table`_, the -space character (number 32) comes before the ``/`` character (number 47). 
If -we remove the common prefix in all of the above strings (``'/p/Folder'``), we -can see why this happens: - -.. code-block:: python - - >>> ' (1)/file.tar.gz' < '/file.tar.gz' - True - >>> ' ' < '/' - True - -This isn't very convenient... how do we solve it? We can split the path -across the path separators and then sort. A convenient way to do this is -with the `Path.parts`_ method from :mod:`pathlib`: - -.. code-block:: python - - >>> import pathlib - >>> sorted(paths, key=lambda x: tuple(natsort_key(s) for s in pathlib.Path(x).parts)) - ['/p/Folder/file.tar.gz', '/p/Folder (1)/file (1).tar.gz', '/p/Folder (1)/file.tar.gz', '/p/Folder (10)/file.tar.gz'] - -Almost! It seems like there is some funny business going on in the final -filename component as well. We can solve that nicely and quickly with `Path.suffixes`_ -and `Path.stem`_. - -.. code-block:: python - - >>> def decompose_path_into_components(x): - ... path_split = list(pathlib.Path(x).parts) - ... # Remove the final filename component from the path. - ... final_component = pathlib.Path(path_split.pop()) - ... # Split off all the extensions. - ... suffixes = final_component.suffixes - ... stem = final_component.name.replace(''.join(suffixes), '') - ... # Remove the '.' prefix of each extension, and make that - ... # final component a list of the stem and each suffix. - ... final_component = [stem] + [x[1:] for x in suffixes] - ... # Replace the split final filename component. - ... path_split.extend(final_component) - ... return path_split - ... - >>> def natsort_key_with_path_support(x): - ... return tuple(natsort_key(s) for s in decompose_path_into_components(x)) - ... - >>> sorted(paths, key=natsort_key_with_path_support) - ['/p/Folder/file.tar.gz', '/p/Folder (1)/file.tar.gz', '/p/Folder (1)/file (1).tar.gz', '/p/Folder (10)/file.tar.gz'] - -This works because in addition to breaking the input by path separators, the final -filename component is separated from its extensions as well [#f1]_. 
*Then*, each of these -separated components is sent to the :mod:`natsort` algorithm, so the result is -a tuple of tuples. Once that is done, we can see how comparisons can be done in -the expected manner. - -.. code-block:: python - - >>> a = natsort_key_with_path_support('/p/Folder (1)/file (1).tar.gz') - >>> a - (('/',), ('p',), ('Folder (', 1, ')'), ('file (', 1, ')'), ('tar',), ('gz',)) - >>> - >>> b = natsort_key_with_path_support('/p/Folder/file.tar.gz') - >>> b - (('/',), ('p',), ('Folder',), ('file',), ('tar',), ('gz',)) - >>> - >>> a > b - True - -Comparing Different Types on Python 3 -+++++++++++++++++++++++++++++++++++++ - -`The second major special case I encountered was sorting of different types`_. -If you are on Python 2 (i.e. legacy Python), this mostly doesn't matter *too* -much since it uses an arbitrary heuristic to allow traditionally un-comparable -types to be compared (such as comparing ``'a'`` to ``1``). However, on Python 3 -(i.e. Python) it simply won't let you perform such nonsense, raising a -:exc:`TypeError` instead. - -You can imagine that a module that breaks strings into tuples of numbers and -strings is walking a dangerous line if it does not have special handling for -comparing numbers and strings. My imagination was not so great at first. -Let's take a look at all the ways this can fail with real-world data. - -.. code-block:: python - - >>> def natsort_key_with_poor_real_number_support(x): - ... split_input = re.split(signed_float, x) - ... split_input = filter(None, split_input) # removes null strings - ... return tuple(coerce_to_float(s) for s in split_input) - >>> - >>> sorted([5, '4'], key=natsort_key_with_poor_real_number_support) - Traceback (most recent call last): - ... - TypeError: ... - >>> - >>> sorted(['12 apples', 'apples'], key=natsort_key_with_poor_real_number_support) - Traceback (most recent call last): - ... - TypeError: ... 
- >>> - >>> sorted(['version5.3.0', 'version5.3rc1'], key=natsort_key_with_poor_real_number_support) - Traceback (most recent call last): - ... - TypeError: ... - -Let's break these down. - -#. The integer ``5`` is sent to ``re.split`` which expects only strings - or bytes, which is a no-no. -#. ``natsort_key_with_poor_real_number_support('12 apples') < natsort_key_with_poor_real_number_support('apples')`` - is the same as ``(12.0, ' apples') < ('apples',)``, and thus a number gets - compared to a string [#f2]_ which also is a no-no. -#. This one scores big on the astonishment scale, especially if one accidentally - uses signed integers or real numbers when they mean to use unsigned integers. - ``natsort_key_with_poor_real_number_support('version5.3.0') < natsort_key_with_poor_real_number_support('version5.3rc1')`` - is the same as ``('version', 5.3, 0.0) < ('version', 5.3, 'rc', 1.0)``, so in the - third element a number gets compared to a string, once again the same - old no-no. (The same would happen with ``'version5-3'`` and ``'version5-a'``, - which would become ``('version', 5, -3)`` and ``('version', 5, '-a')``). - -As you might expect, the solution to the first issue is to wrap the ``re.split`` -call in a ``try: except:`` block and handle the number specially if a -:exc:`TypeError` is raised. The second and third cases *could* be handled -in a "special case" manner, meaning only respond and do something different -if these problems are detected. But a less error-prone method is to ensure -that the data is correct-by-construction, and this can be done by ensuring -that the returned tuples *always* start with a string, and then alternate -in a string-number-string-number-string pattern; this can be achieved by -adding an empty string wherever the pattern is not followed [#f3]_. This ends -up working out pretty nicely because empty strings are always "less" than -any non-empty string, and we typically want numbers to come before strings. 
- -Let's take a look at how this works out. - -.. code-block:: python - - >>> from natsort.utils import sep_inserter - >>> list(sep_inserter(iter(['apples']), '')) - ['apples'] - >>> - >>> list(sep_inserter(iter([12, ' apples']), '')) - ['', 12, ' apples'] - >>> - >>> list(sep_inserter(iter(['version', 5, -3]), '')) - ['version', 5, '', -3] - >>> - >>> from natsort import natsort_keygen, ns - >>> natsort_key_with_good_real_number_support = natsort_keygen(alg=ns.REAL) - >>> - >>> sorted([5, '4'], key=natsort_key_with_good_real_number_support) - ['4', 5] - >>> - >>> sorted(['12 apples', 'apples'], key=natsort_key_with_good_real_number_support) - ['12 apples', 'apples'] - >>> - >>> sorted(['version5.3.0', 'version5.3rc1'], key=natsort_key_with_good_real_number_support) - ['version5.3.0', 'version5.3rc1'] - -How the "good" version works will be given in `TL;DR 2 - Handling Crappy, Real-World Input`_. - -Handling NaN -++++++++++++ - -`A rather unexpected special case I encountered was sorting collections containing NaN`_. -Let's see what happens when you try to sort a plain old list of numbers when there -is a **NaN** floating around in there. - -.. code-block:: python - - >>> danger = [7, float('nan'), 22.7, 19, -14, 59.123, 4] - >>> sorted(danger) - [7, nan, -14, 4, 19, 22.7, 59.123] - -Clearly that isn't correct, and for once it isn't my fault! -`It's hard to compare floating point numbers`_. By definition, **NaN** is unorderable -to any other number, and is never equal to any other number, including itself. - -.. code-block:: python - - >>> nan = float('nan') - >>> 5 > nan - False - >>> 5 < nan - False - >>> 5 == nan - False - >>> 5 != nan - True - >>> nan == nan - False - >>> nan != nan - True - -The implication of all this for us is that if there is an **NaN** in the -data-set we are trying to sort, the data-set will end up being sorted in -two separate yet individually sorted sequences - the one *before* the **NaN**, -and the one *after*. 
This is because the ``<`` operation that is used -to sort always returns :const:`False` with **NaN**. - -Because :mod:`natsort` aims to sort sequences in a way that does not surprise -the user, keeping this behavior is not acceptable (I don't require my users -to know how **NaN** will behave in a sorting algorithm). The simplest way to -satisfy the "least astonishment" principle is to substitute **NaN** with -some other value. But what value is *least* astonishing? I chose to replace -**NaN** with :math:`-\infty` so that these poorly behaved elements always -end up at the front where the users will most likely be alerted to their presence. - -.. code-block:: python - - >>> def fix_nan(x): - ... if x != x: # only true for NaN - ... return float('-inf') - ... else: - ... return x - ... - -Let's check out :ref:`TL;DR 2 <tldr2>` to see how this can be -incorporated into the simple key function from :ref:`TL;DR 1 <tldr1>`. - -.. _tldr2: - -TL;DR 2 - Handling Crappy, Real-World Input -+++++++++++++++++++++++++++++++++++++++++++ - -Let's see how our elegant key function from :ref:`TL;DR 1 <tldr1>` has -become bastardized in order to support handling mixed real-world data -and user customizations. - - >>> def natsort_key(x, as_float=False, signed=False, as_path=False): - ... if as_float: - ... regex = signed_float if signed else unsigned_float - ... else: - ... regex = signed_int if signed else unsigned_int - ... try: - ... if as_path: - ... x = decompose_path_into_components(x) # Decomposes into list of strings - ... # If this raises a TypeError, input is not a string. - ... split_input = re.split(regex, x) - ... except TypeError: - ... try: - ... # Does this need to be applied recursively (list-of-list)? - ... return tuple(map(natsort_key, x)) - ... except TypeError: - ... # Must be a number - ... ret = ('', fix_nan(x)) # Maintain string-number-string pattern - ... return (ret,) if as_path else ret # as_path returns tuple-of-tuples - ... else: - ... 
split_input = filter(None, split_input) # removes null strings - ... # Note that the coerce_to_int/coerce_to_float functions - ... # are also modified to use the fix_nan function. - ... if as_float: - ... coerced_input = (coerce_to_float(s) for s in split_input) - ... else: - ... coerced_input = (coerce_to_int(s) for s in split_input) - ... return tuple(sep_inserter(coerced_input, '')) - ... - -And this doesn't even show handling :class:`bytes` type! Notice that we have -to do non-obvious things like modify the return form of numbers when ``as_path`` -is given, just to avoid comparing strings and numbers for the case in which a user provides -input like ``['/home/me', 42]``. - -Let's take it out for a spin! - -.. code-block:: python - - >>> danger = [7, float('nan'), 22.7, '19', '-14', '59.123', 4] - >>> sorted(danger, key=lambda x: natsort_key(x, as_float=True, signed=True)) - [nan, '-14', 4, 7, '19', 22.7, '59.123'] - >>> - >>> paths = ['/p/Folder (1)/file.tar.gz', - ... '/p/Folder/file.tar.gz', - ... 123456] - >>> sorted(paths, key=lambda x: natsort_key(x, as_path=True)) - [123456, '/p/Folder/file.tar.gz', '/p/Folder (1)/file.tar.gz'] - -Here Be Dragons: Adding Locale Support --------------------------------------- - -.. contents:: - :local: - -Probably the most challenging special case I had to handle was getting -:mod:`natsort` to handle sorting the non-numerical parts of input -correctly, and also allowing it to sort the numerical bits in different -locales. This was in no way what I originally set out to do with this -library, so I was `caught a bit off guard when the request was initially made`_. -I discovered the :mod:`locale` library, and assumed that if it's part of Python's -StdLib there can't be too many dragons, right? - -.. 
admonition:: INCOMPLETE LIST OF DRAGONS - - - https://github.com/SethMMorton/natsort/issues/21 - - https://github.com/SethMMorton/natsort/issues/22 - - https://github.com/SethMMorton/natsort/issues/23 - - https://github.com/SethMMorton/natsort/issues/36 - - https://github.com/SethMMorton/natsort/issues/44 - - https://bugs.python.org/issue2481 - - https://bugs.python.org/issue23195 - - https://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help - - https://stackoverflow.com/questions/22203550/sort-dictionary-by-key-using-locale-collation - - https://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm - - https://stackoverflow.com/questions/36431810/sort-numeric-lines-with-thousand-separators - - https://stackoverflow.com/questions/45734562/how-can-i-get-a-reasonable-string-sorting-with-python - -These can be summed up as follows: - -#. :mod:`locale` is a thin wrapper over your operating system's *locale* - library, so if *that* is broken (like it is on BSD and OSX) then - :mod:`locale` is broken in Python. -#. Because of a bug in legacy Python (i.e. Python 2), there is no uniform way to use - the :mod:`locale` sorting functionality between legacy Python and Python 3. -#. People have differing opinions of how capitalization should affect word order. -#. There is no built-in way to handle locale-dependent thousands separators - and decimal points *robustly*. -#. Proper handling of Unicode is complicated. -#. Proper handling of :mod:`locale` is complicated. - -Easily over half of the code in :mod:`natsort` is in some way dealing with some -aspect of :mod:`locale` or basic case handling. It would have been -impossible to get right without a `really good`_ `testing strategy`_. - -Don't expect any more TL;DR's... if you want to see how all this is fully -incorporated into the :mod:`natsort` algorithm then please take a look -`at the code`_. 
However, I will hint at how specific steps are taken in -each section. - -Let's see how we can handle some of the dragons, one-by-one. - -Basic Case Control Support -++++++++++++++++++++++++++ - -Without even thinking about the mess that is adding :mod:`locale` support, -:mod:`natsort` can introduce support for controlling how case is interpreted. - -First, let's take a look at how it is sorted by default (due to -where characters lie on the `ASCII table`_). - -.. code-block:: python - - >>> a = ['Apple', 'corn', 'Corn', 'Banana', 'apple', 'banana'] - >>> sorted(a) - ['Apple', 'Banana', 'Corn', 'apple', 'banana', 'corn'] - -All uppercase letters come before lowercase letters in the `ASCII table`_, -so all capitalized words appear first. Not everyone agrees that this -is the correct order. Some believe that the capitalized words should -be last (``['apple', 'banana', 'corn', 'Apple', 'Banana', 'Corn']``). -Some believe that both the lowercase and uppercase versions -should appear together (``['Apple', 'apple', 'Banana', 'banana', 'Corn', 'corn']``). -Some believe that both should be true ☹. Some people don't care at all [#f4]_. - -Solving the first case (I call it *LOWERCASEFIRST*) is actually pretty -easy... just call the :meth:`str.swapcase` method on the input. - -.. code-block:: python - - >>> sorted(a, key=lambda x: x.swapcase()) - ['apple', 'banana', 'corn', 'Apple', 'Banana', 'Corn'] - -The last (I call it *IGNORECASE*) should be super easy, right? -Simply call :meth:`str.lower` on the input. This will work but may -not always give the correct answer on non-latin character sets. It's -a good thing that in Python 3.3 -:meth:`str.casefold` was introduced, which does a better job of removing -all case information from unicode characters in -non-latin alphabets. - -.. code-block:: python - - >>> def remove_case(x): - ... try: - ... return x.casefold() - ... except AttributeError: # Legacy Python backwards compatibility - ... return x.lowercase() - ... 
- >>> sorted(a, key=remove_case) - ['Apple', 'apple', 'Banana', 'banana', 'corn', 'Corn'] - -The middle case (I call it *GROUPLETTERS*) is less straightforward. -The most efficient way to handle this is to duplicate each character -with its lowercase version and then the original character. - -.. code-block:: python - - >>> import itertools - >>> def groupletters(x): - ... return ''.join(itertools.chain.from_iterable((remove_case(y), y) for y in x)) - ... - >>> groupletters('Apple') - 'aAppppllee' - >>> groupletters('apple') - 'aappppllee' - >>> sorted(a, key=groupletters) - ['Apple', 'apple', 'Banana', 'banana', 'Corn', 'corn'] - -The effect of this is that both ``'Apple'`` and ``'apple'`` are -placed adjacent to each other because their transformations both begin -with ``'a'``, and then the second character can be used to order them -appropriately with respect to each other. - -There's a problem with this, though. Within the context of :mod:`natsort` -we are trying to correctly sort numbers and those should be left alone. - -.. code-block:: python - - >>> a = ['Apple5', 'apple', 'Apple4E10', 'Banana'] - >>> sorted(a, key=lambda x: natsort_key(x, as_float=True)) - ['Apple5', 'Apple4E10', 'Banana', 'apple'] - >>> sorted(a, key=lambda x: natsort_key(groupletters(x), as_float=True)) - ['Apple4E10', 'Apple5', 'apple', 'Banana'] - >>> groupletters('Apple4E10') - 'aAppppllee44eE1100' - -We messed up the numbers! Looks like :func:`groupletters` needs to be applied -*after* the strings are broken into their components. I'm not going to show -how this is done here, but basically it requires applying the function in -the ``else:`` block of :func:`coerce_to_int`/:func:`coerce_to_float`. - -.. 
code-block:: python - - >>> better_groupletters = natsort_keygen(alg=ns.GROUPLETTERS | ns.REAL) - >>> better_groupletters('Apple4E10') - ('aAppppllee', 40000000000.0) - >>> sorted(a, key=better_groupletters) - ['Apple5', 'Apple4E10', 'apple', 'Banana'] - -Of course, applying both *LOWERCASEFIRST* and *GROUPLETTERS* is just -a matter of turning on both functions. - -Basic Unicode Support -+++++++++++++++++++++ - -Unicode is hard and complicated. Here's an example. - -.. code-block:: python - - >>> b = [b'\x66', b'\x65', b'\xc3\xa9', b'\x65\xcc\x81', b'\x61', b'\x7a'] - >>> a = [x.decode('utf8') for x in b] - >>> a # doctest: +SKIP - ['f', 'e', 'é', 'é', 'a', 'z'] - >>> sorted(a) # doctest: +SKIP - ['a', 'e', 'é', 'f', 'z', 'é'] - - -There is more than one way to represent the character 'é' in Unicode. -In fact, many characters have multiple representations. This is a challenge -because comparing the two representations would return ``False`` even though -they *look* the same. - -.. code-block:: python - - >>> a[2] == a[3] - False - -Alas, since characters are compared based on the numerical value of their -representation, sorting Unicode often gives unexpected results (like seeing -'é' come both *before* and *after* 'z'). - -The original approach that :mod:`natsort` took with respect to non-ASCII -Unicode characters was to say "just use -the :mod:`locale` or :mod:`PyICU` library" and then cross its fingers -and hope those libraries take care of it. As you will find in the following -sections, that comes with its own baggage, and turned out to not always work anyway -(see https://stackoverflow.com/q/45734562/1399279). A more robust approach is to -handle the Unicode out-of-the-box without invoking a heavy-handed library -like :mod:`locale` or :mod:`PyICU`. To do this, we must use *normalization*. - -To fully understand Unicode normalization, `check out some official Unicode documentation`_. -Just kidding... that's too much text. 
The following StackOverflow answers do -a good job at explaining Unicode normalization in simple terms: -https://stackoverflow.com/a/7934397/1399279 and -https://stackoverflow.com/a/7931547/1399279. Put simply, normalization -ensures that Unicode characters with multiple representations are in -some canonical and consistent representation so that (for example) comparisons -of the characters can be performed in a sane way. The following discussion -assumes you at least read the StackOverflow answers. - -Looking back at our 'é' example, we can see that the two versions were -constructed with the byte strings ``b'\xc3\xa9'`` and ``b'\x65\xcc\x81'``. -The former representation is actually -`LATIN SMALL LETTER E WITH ACUTE `_ -and is a single character in the Unicode standard. This is known as the -*compressed form* and corresponds to the 'NFC' normalization scheme. -The latter representation is actually the letter 'e' followed by -`COMBINING ACUTE ACCENT `_ -and so is two characters in the Unicode standard. This is known as the -*decompressed form* and corresponds to the 'NFD' normalization scheme. -Since the first character in the decompressed form is actually the letter 'e', -when compared to other ASCII characters it fits where you might expect. -Unfortunately, all Unicode compressed form characters come after the -ASCII characters and so they always will be placed after 'z' when sorting. - -It seems that most Unicode data is stored and shared in the compressed form -which makes it challenging to sort. This can be solved by normalizing all -incoming Unicode data to the decompressed form ('NFD') and *then* sorting. - -.. code-block:: python - - >>> import unicodedata - >>> c = [unicodedata.normalize('NFD', x) for x in a] - >>> c # doctest: +SKIP - ['f', 'e', 'é', 'é', 'a', 'z'] - >>> sorted(c) # doctest: +SKIP - ['a', 'e', 'é', 'é', 'f', 'z'] - -Huzzah! Sane sorting without having to resort to :mod:`locale`! 
- -Using Locale to Compare Strings -+++++++++++++++++++++++++++++++ - -The :mod:`locale` module is actually pretty cool, and provides lowly -spare-time programmers like myself a way to handle the daunting task -of proper locale-dependent support of their libraries and utilities. -Having said that, it can be a bit of a bear to get right, -`although they do point out in the documentation that it will be painful to use`_. -Aside from the caveats spelled out in that link, it turns out that just -comparing strings with :mod:`locale` in a cross-platform and -cross-python-version manner is not as straightforward as one might hope. - -First, how to use :mod:`locale` to compare strings? It's actually -pretty straightforward. Simply run the input through the :mod:`locale` -transformation function :func:`locale.strxfrm`. - -.. code-block:: python - - >>> import locale, sys - >>> locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') - 'en_US.UTF-8' - >>> a = ['a', 'b', 'ä'] - >>> sorted(a) - ['a', 'b', 'ä'] - >>> # The below fails on OSX, so don't run doctest on darwin. - >>> is_osx = sys.platform == 'darwin' - >>> sorted(a, key=locale.strxfrm) if not is_osx else ['a', 'ä', 'b'] - ['a', 'ä', 'b'] - >>> - >>> a = ['apple', 'Banana', 'banana', 'Apple'] - >>> sorted(a, key=locale.strxfrm) if not is_osx else ['apple', 'Apple', 'banana', 'Banana'] - ['apple', 'Apple', 'banana', 'Banana'] - -It turns out that locale-aware sorting groups numbers in the same -way as turning on *GROUPLETTERS* and *LOWERCASEFIRST*. -The trick is that you have to apply :func:`locale.strxfrm` only to non-numeric -characters; otherwise, numbers won't be parsed properly. Therefore, it must -be applied as part of the :func:`coerce_to_int`/:func:`coerce_to_float` -functions in a manner similar to :func:`groupletters`. - -As you might have guessed, there is a small problem. 
-It turns out that there is a bug in the legacy Python implementation of -:func:`locale.strxfrm` that causes it to outright fail for :func:`unicode` -input (https://bugs.python.org/issue2481). :func:`locale.strcoll` works, -but is intended for use with ``cmp``, which does not exist in current Python -implementations. Luckily, the :func:`functools.cmp_to_key` function -makes :func:`locale.strcoll` behave like :func:`locale.strxfrm` (that is, of course, -unless you are on Python 2.6 where :func:`functools.cmp_to_key` doesn't exist, -in which case you simply copy-paste the implementation from Python 2.7 -directly into your code ☹). - -Handling Broken Locale On OSX -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -But what if the underlying *locale* implementation that :mod:`locale` -relies upon is simply broken? It turns out that the *locale* library on -OSX (and other BSD systems) is broken (and for some reason has never been -fixed?), and so :mod:`locale` does not work as expected. - -How do I define "doesn't work as expected"? - -.. code-block:: python - - >>> a = ['apple', 'Banana', 'banana', 'Apple'] - >>> sorted(a) - ['Apple', 'Banana', 'apple', 'banana'] - >>> - >>> sorted(a, key=locale.strxfrm) if is_osx else sorted(a) - ['Apple', 'Banana', 'apple', 'banana'] - -IT'S SORTING AS IF :func:`locale.strxfrm` WAS NEVER USED!! (and it's worse -once non-ASCII characters get thrown into the mix.) I'm really not -sure why this is considered OK for the OSX/BSD maintainers to not fix, -but it's more than frustrating for poor developers who have been dragged -into the *locale* game kicking and screaming. - -So, how to deal with this situation? There are two ways to do so. - -#. Detect if :mod:`locale` is sorting incorrectly (i.e. ``dumb``) by seeing - if ``'A'`` is sorted before ``'a'`` (incorrect) or not. - - .. code-block:: python - - >>> # This is genuinely the name of this function. - >>> # See natsort.compat.locale.py - >>> def dumb_sort(): - ... 
return locale.strxfrm('A') < locale.strxfrm('a') - ... - - If a ``dumb`` *locale* implementation is found, then automatically - turn on *LOWERCASEFIRST* and *GROUPLETTERS*. -#. Use an alternate library if installed. `ICU `_ - is a great and powerful library that has a pretty decent Python port - called (you guessed it) `PyICU `_. - If a user has this library installed on their computer, :mod:`natsort` - chooses to use that instead of :mod:`locale`. With a little bit of - planning, one can write a set of wrapper functions that call - the correct library under the hood such that the business logic never - has to know what library is being used (see `natsort.compat.locale.py`_). - -Let me tell you, this little complication really makes a challenge of testing -the code, since one must set up different environments on different operating -systems in order to test all possible code paths. Not to mention that -certain checks *will* fail for certain operating systems and environments -so one must be diligent in either writing the tests not to fail, or ignoring -those tests when on offending environments. - -Handling Locale-Aware Numbers -+++++++++++++++++++++++++++++ - -`Thousands separator support`_ is a problem that I knew would someday be -requested but had decided to push off until a rainy day. One day it finally -rained, and I decided to tackle the problem. - -So what is the problem? Consider the number ``1,234,567`` (assuming the -``','`` is the thousands separator). Try to run that through :func:`int` -and you will get a :exc:`ValueError`. To handle this properly the thousands -separators must be removed. - -.. code-block:: python - - >>> float('1,234,567'.replace(',', '')) - 1234567.0 - -What if, in our current locale, the thousands separator is ``'.'`` and -the ``','`` is the decimal separator (like for the German locale *de_DE*)? - -.. 
code-block:: python - - >>> float('1.234.567'.replace('.', '').replace(',', '.')) - 1234567.0 - >>> float('1.234.567,89'.replace('.', '').replace(',', '.')) - 1234567.89 - -This is pretty much what :func:`locale.atoi` and :func:`locale.atof` do -under the hood. So what's the problem? Why doesn't :mod:`natsort` just -use this method under its hood? -Well, let's take a look at what would happen if we send some possible -:mod:`natsort` input through the above function: - -.. code-block:: python - - >>> natsort_key('1,234 apples, please.'.replace(',', '')) - ('', 1234, ' apples please.') - >>> natsort_key('Sir, €1.234,50 please.'.replace('.', '').replace(',', '.'), as_float=True) - ('Sir. €', 1234.5, ' please') - -Any character matching the thousands separator was dropped, and anything -matching the decimal separator was changed to ``'.'``! If these characters -were critical to how your data was ordered, this would break :mod:`natsort`. - -The first solution one might consider would be to first decompose the -input into sub-components (like we did for the *GROUPLETTERS* method -above) and then only apply these transformations on the number components. -This is a chicken-and-egg problem, though, because *we cannot appropriately -separate out the numbers because of the thousands separators and -non-'.' decimal separators* (well, at least not without making multiple -passes over the data which I do not consider to be a valid option). - -Regular expressions to the rescue! With regular expressions, we can -remove the thousands separators and change the decimal separator only -when they are actually within a number. Once the input has been -pre-processed with this regular expression, all the infrastructure -shown previously will work. - -Beware, these regular expressions will make your eyes bleed. - -.. 
code-block:: python - - >>> decimal = ',' # Assume German locale, so decimal separator is ',' - >>> # Look-behind assertions cannot accept range modifiers, so instead of i.e. - >>> # (?>> nodecimal = r'(?>> strip_thousands = r''' - ... (?<=[0-9]{{1}}) # At least 1 number - ... (?>> re.sub(strip_thousands, '', 'Sir, €1.234,50 please.', flags=re.X) - 'Sir, €1234,50 please.' - >>> - >>> # The decimal point must be preceded by a number or after - >>> # a number. This option only needs to be performed in the - >>> # case when the decimal separator for the locale is not '.'. - >>> switch_decimal = r'(?<=[0-9]){decimal}|{decimal}(?=[0-9])' - >>> switch_decimal = switch_decimal.format(decimal=decimal) - >>> re.sub(switch_decimal, '.', 'Sir, €1234,50 please.', flags=re.X) - 'Sir, €1234.50 please.' - >>> - >>> natsort_key('Sir, €1234.50 please.', as_float=True) - ('Sir, €', 1234.5, ' please.') - -Final Thoughts --------------- - -My hope is that users of :mod:`natsort` never have to think about or worry -about all the bookkeeping or any of the details described above, and that using -:mod:`natsort` seems to magically "just work". For those of you who -took the time to read this engineering description, I hope it has enlightened -you to some of the issues that can be encountered when code is released -into the wild and has to accept "real-world data", or to what happens -to developers who naïvely make bold assumptions that are counter to -what the rest of the world assumes. - -.. rubric:: Footnotes - -.. [#f1] - To anyone looking through the actual code, you will note that I don't - actually use :mod:`pathlib` to split the paths... I wrote my own version - to avoid adding an external dependency of :mod:`pathlib` on Python < 3.4. -.. [#f2] - *"But if you hadn't removed the leading empty string from re.split this - wouldn't have happened!!"* I can hear you saying. Well, that's true. 
I don't - have a *great* reason for having done that except that in an earlier - non-optimal incarnation of the algorithm I needed it, and it kind of - stuck, and it made other parts of the code easier if the assumption that - there were no empty strings was valid. -.. [#f3] - I'm not going to show how this is implemented in this document, - but if you are interested you can look at the code to - :func:`sep_inserter` in `util.py`_. -.. [#f4] - Handling each of these is straightforward, but coupled with the rapidly - fracturing execution paths presented in :ref:`TL;DR 2 ` one can imagine - this will get out of hand quickly. If you take a look at `natsort.py`_ and - `util.py`_ you can observe that to avoid this I take a more functional approach - to constructing the :mod:`natsort` algorithm as opposed to the procedural approach - illustrated in :ref:`TL;DR 1 ` and :ref:`TL;DR 2 `. - -.. _ASCII table: http://www.asciitable.com/ -.. _getting sorting right is surprisingly hard: http://www.compciv.org/guides/python/fundamentals/sorting-collections-with-sorted/ -.. _This astonished: https://github.com/SethMMorton/natsort/issues/19 -.. _a lot: http://stackoverflow.com/questions/29548742/python-natsort-sort-strings-recursively -.. _of people: http://stackoverflow.com/questions/24045348/sort-set-of-numbers-in-the-form-xx-yy-in-python -.. _and some people aren't very nice when they are astonished: - https://github.com/xolox/python-naturalsort/blob/ed3e6b6ffaca3bdea3b76e08acbb8bd2a5fee463/README.rst#why-another-natsort-module -.. _fastnumbers: https://github.com/SethMMorton/fastnumbers -.. _as part of my testing: https://github.com/SethMMorton/natsort/blob/master/test_natsort/slow_splitters.py -.. _this one for coercion: http://stackoverflow.com/questions/736043/checking-if-a-string-can-be-converted-to-float-in-python -.. _this one for checking: http://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-float-in-python -.. 
_most natural sort solutions for python on Stack Overflow: http://stackoverflow.com/q/4836710/1399279 -.. _80%/20%: https://en.wikipedia.org/wiki/Pareto_principle -.. _The first major special case I encountered was sorting filesystem paths: https://github.com/SethMMorton/natsort/issues/3 -.. _The second major special case I encountered was sorting of different types: https://github.com/SethMMorton/natsort/issues/7 -.. _A rather unexpected special case I encountered was sorting collections containing NaN: - https://github.com/SethMMorton/natsort/issues/27 -.. _Path.parts: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.parts -.. _Path.suffixes: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffixes -.. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem -.. _It's hard to compare floating point numbers: http://www.drdobbs.com/cpp/its-hard-to-compare-floating-point-numbe/240149806 -.. _caught a bit off guard when the request was initially made: https://github.com/SethMMorton/natsort/issues/14 -.. _at the code: https://github.com/SethMMorton/natsort/tree/master/natsort -.. _natsort.py: https://github.com/SethMMorton/natsort/blob/master/natsort/natsort.py -.. _util.py: https://github.com/SethMMorton/natsort/blob/master/natsort/util.py -.. _although they do point out in the documentation that it will be painful to use: - https://docs.python.org/3/library/locale.html#background-details-hints-tips-and-caveats -.. _natsort.compat.locale.py: https://github.com/SethMMorton/natsort/blob/master/natsort/compat/locale.py -.. _Thousands separator support: https://github.com/SethMMorton/natsort/issues/36 -.. _really good: https://hypothesis.readthedocs.io/en/latest/ -.. _testing strategy: http://doc.pytest.org/en/latest/ -.. 
_check out some official Unicode documentation: http://unicode.org/reports/tr15/ diff --git a/docs/source/humansorted.rst b/docs/source/humansorted.rst deleted file mode 100644 index 35be19b..0000000 --- a/docs/source/humansorted.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:func:`~natsort.humansorted` -============================ - -.. autofunction:: humansorted - diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index 9d7c81b..0000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,28 +0,0 @@ -.. natsort documentation master file, created by - sphinx-quickstart on Thu Jul 17 21:01:29 2014. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -natsort: Simple yet flexible natural sorting in Python. -======================================================= - -Contents: - -.. toctree:: - :maxdepth: 2 - :numbered: - - intro.rst - howitworks.rst - examples.rst - api.rst - shell.rst - changelog.rst - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - diff --git a/docs/source/index_humansorted.rst b/docs/source/index_humansorted.rst deleted file mode 100644 index e143b67..0000000 --- a/docs/source/index_humansorted.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:func:`~natsort.index_humansorted` -================================== - -.. autofunction:: index_humansorted - diff --git a/docs/source/index_natsorted.rst b/docs/source/index_natsorted.rst deleted file mode 100644 index ea48f25..0000000 --- a/docs/source/index_natsorted.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:func:`~natsort.index_natsorted` -================================ - -.. 
autofunction:: index_natsorted - diff --git a/docs/source/index_realsorted.rst b/docs/source/index_realsorted.rst deleted file mode 100644 index 215c3e9..0000000 --- a/docs/source/index_realsorted.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:func:`~natsort.index_realsorted` -================================= - -.. autofunction:: index_realsorted - diff --git a/docs/source/index_versorted.rst b/docs/source/index_versorted.rst deleted file mode 100644 index 07e266f..0000000 --- a/docs/source/index_versorted.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:func:`~natsort.index_versorted` -================================ - -.. autofunction:: index_versorted - diff --git a/docs/source/intro.rst b/docs/source/intro.rst deleted file mode 100644 index fb09294..0000000 --- a/docs/source/intro.rst +++ /dev/null @@ -1,397 +0,0 @@ -.. default-domain:: py -.. module:: natsort - -The :mod:`natsort` module -========================= - -Simple yet flexible natural sorting in Python. - - - Source Code: https://github.com/SethMMorton/natsort - - Downloads: https://pypi.org/project/natsort/ - - Documentation: http://natsort.readthedocs.io/ - - Optional Dependencies: - - - `fastnumbers `_ >= 2.0.0 - - `PyICU `_ >= 1.0.0 - -:mod:`natsort` is a general utility for sorting lists *naturally*; the definition -of "naturally" is not well-defined, but the most common definition is that numbers -contained within the string should be sorted as numbers and not as you would -other characters. If you need to present sorted output to a user, you probably -want to sort it naturally. - -:mod:`natsort` was initially created for sorting scientific output filenames that -contained signed floating point numbers in the names. 
There was a lack of -algorithms out there that could perform a natural sort on `floats` but -plenty for `ints`; check out -`this StackOverflow question `_ -and its answers and links therein, -`this ActiveState forum `_, -and of course `this great article on natural sorting `_ -from CodingHorror.com for examples of what I mean. -:mod:`natsort` was created to fill in this gap, but has since expanded to handle -just about any definition of a number, as well as other sorting customizations. - -Quick Description ------------------ - -When you try to sort a list of strings that contain numbers, the normal python -sort algorithm sorts lexicographically, so you might not get the results that you -expect: - -.. code-block:: python - - >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] - >>> sorted(a) - ['1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '2 ft 7 in', '7 ft 6 in'] - -Notice that it has the order ('1', '10', '2') - this is because the list is -being sorted in lexicographical order, which sorts numbers like you would -letters (i.e. 'b', 'ba', 'c'). - -:mod:`natsort` provides a function :func:`~natsorted` that helps sort lists -"naturally" ("naturally" is rather ill-defined, but in general it means -sorting based on meaning and not computer code point).. -Using :func:`~natsorted` is simple: - -.. code-block:: python - - >>> from natsort import natsorted - >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] - >>> natsorted(a) - ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] - -:func:`~natsorted` identifies numbers anywhere in a string and sorts them -naturally. Below are some other things you can do with :mod:`natsort` -(please see the :ref:`examples` for a quick start guide, or the :ref:`api` -for more details). - -.. note:: - - :func:`~natsorted` is designed to be a drop-in replacement for the built-in - :func:`sorted` function. Like :func:`sorted`, :func:`~natsorted` - `does not sort in-place`. 
To sort a list and assign the output to the - same variable, you must explicitly assign the output to a variable: - - .. code-block:: python - - >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] - >>> natsorted(a) - ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] - >>> print(a) # 'a' was not sorted; "natsorted" simply returned a sorted list - ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] - >>> a = natsorted(a) # Now 'a' will be sorted because the sorted list was assigned to 'a' - >>> print(a) - ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] - - Please see `Generating a Reusable Sorting Key and Sorting In-Place`_ for - an alternate way to sort in-place naturally. - -Examples --------- - -Sorting Versions -++++++++++++++++ - -This is handled properly by default (as of :mod:`natsort` version >= 4.0.0): - -.. code-block:: python - - >>> a = ['version-1.9', 'version-2.0', 'version-1.11', 'version-1.10'] - >>> natsorted(a) - ['version-1.9', 'version-1.10', 'version-1.11', 'version-2.0'] - -If you need to sort release candidates, please see :ref:`rc_sorting` for -a useful hack. - -Sorting by Real Numbers (i.e. Signed Floats) -++++++++++++++++++++++++++++++++++++++++++++ - -This is useful in scientific data analysis and was -the default behavior of :func:`~natsorted` for :mod:`natsort` -version < 4.0.0. Use the :func:`~realsorted` function: - -.. 
code-block:: python - - >>> from natsort import realsorted, ns - >>> # Note that when interpreting as signed floats, the below numbers are - >>> # +5.10, -3.00, +5.30, +2.00 - >>> a = ['position5.10.data', 'position-3.data', 'position5.3.data', 'position2.data'] - >>> natsorted(a) - ['position2.data', 'position5.3.data', 'position5.10.data', 'position-3.data'] - >>> natsorted(a, alg=ns.REAL) - ['position-3.data', 'position2.data', 'position5.10.data', 'position5.3.data'] - >>> realsorted(a) # shortcut for natsorted with alg=ns.REAL - ['position-3.data', 'position2.data', 'position5.10.data', 'position5.3.data'] - -Locale-Aware Sorting (or "Human Sorting") -+++++++++++++++++++++++++++++++++++++++++ - -This is where the non-numeric characters are ordered based on their meaning, -not on their ordinal value, and a locale-dependent thousands separator and decimal -separator is accounted for in the number. -This can be achieved with the :func:`~humansorted` function: - -.. code-block:: python - - >>> a = ['Apple', 'apple15', 'Banana', 'apple14,689', 'banana'] - >>> natsorted(a) - ['Apple', 'Banana', 'apple14,689', 'apple15', 'banana'] - >>> import locale - >>> locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') - 'en_US.UTF-8' - >>> natsorted(a, alg=ns.LOCALE) - ['apple15', 'apple14,689', 'Apple', 'banana', 'Banana'] - >>> from natsort import humansorted - >>> humansorted(a) - ['apple15', 'apple14,689', 'Apple', 'banana', 'Banana'] - -You may find you need to explicitly set the locale to get this to work -(as shown in the example). -Please see :ref:`locale_issues` and the Installation section -below before using the :func:`~humansorted` function. - -Further Customizing Natsort -+++++++++++++++++++++++++++ - -If you need to combine multiple algorithm modifiers (such as ``ns.REAL``, -``ns.LOCALE``, and ``ns.IGNORECASE``), you can combine the options using the -bitwise OR operator (``|``). For example, - -.. 
code-block:: python - - >>> a = ['Apple', 'apple15', 'Banana', 'apple14,689', 'banana'] - >>> natsorted(a, alg=ns.REAL | ns.LOCALE | ns.IGNORECASE) - ['Apple', 'apple15', 'apple14,689', 'Banana', 'banana'] - >>> # The ns enum provides long and short forms for each option. - >>> ns.LOCALE == ns.L - True - >>> # You can also customize the convenience functions, too. - >>> natsorted(a, alg=ns.REAL | ns.LOCALE | ns.IGNORECASE) == realsorted(a, alg=ns.L | ns.IC) - True - >>> natsorted(a, alg=ns.REAL | ns.LOCALE | ns.IGNORECASE) == humansorted(a, alg=ns.R | ns.IC) - True - -All of the available customizations can be found in the documentation for -the :class:`~natsort.ns` enum. - -You can also add your own custom transformation functions with the ``key`` argument. -These can be used with ``alg`` if you wish: - -.. code-block:: python - - >>> a = ['apple2.50', '2.3apple'] - >>> natsorted(a, key=lambda x: x.replace('apple', ''), alg=ns.REAL) - ['2.3apple', 'apple2.50'] - -Sorting Mixed Types -+++++++++++++++++++ - -You can mix and match ``int``, ``float``, and ``str`` (or ``unicode``) types -when you sort: - -.. code-block:: python - - >>> a = ['4.5', 6, 2.0, '5', 'a'] - >>> natsorted(a) - [2.0, '4.5', '5', 6, 'a'] - >>> # On Python 2, sorted(a) would return [2.0, 6, '4.5', '5', 'a'] - >>> # On Python 3, sorted(a) would raise an "unorderable types" TypeError - -Handling Bytes on Python 3 -++++++++++++++++++++++++++ - -:mod:`natsort` does not officially support the `bytes` type on Python 3, but -convenience functions are provided that help you decode to `str` first: - -.. code-block:: python - - >>> from natsort import as_utf8 - >>> a = [b'a', 14.0, 'b'] - >>> # On Python 2, natsorted(a) would work as expected. - >>> # On Python 3, natsorted(a) would raise a TypeError (bytes() < str()) - >>> natsorted(a, key=as_utf8) == [14.0, b'a', 'b'] - True - >>> a = [b'a56', b'a5', b'a6', b'a40'] - >>> # On Python 2, natsorted(a) would work as expected. 
- >>> # On Python 3, natsorted(a) would return the same results as sorted(a) - >>> natsorted(a, key=as_utf8) == [b'a5', b'a6', b'a40', b'a56'] - True - -Generating a Reusable Sorting Key and Sorting In-Place -++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -Under the hood, :func:`~natsorted` works by generating a custom sorting -key using :func:`~natsort_keygen` and then passes that to the built-in -:func:`sorted`. You can use the :func:`~natsort_keygen` function yourself to -generate a custom sorting key to sort in-place using the :meth:`list.sort` -method. - -.. code-block:: python - - >>> from natsort import natsort_keygen - >>> natsort_key = natsort_keygen() - >>> a = ['2 ft 7 in', '1 ft 5 in', '10 ft 2 in', '2 ft 11 in', '7 ft 6 in'] - >>> natsorted(a) == sorted(a, key=natsort_key) - True - >>> a.sort(key=natsort_key) - >>> a - ['1 ft 5 in', '2 ft 7 in', '2 ft 11 in', '7 ft 6 in', '10 ft 2 in'] - -All of the algorithm customizations mentioned in the `Further Customizing Natsort`_ -section can also be applied to :func:`~natsort_keygen` through the *alg* keyword option. - -Other Useful Things -+++++++++++++++++++ - - - recursively descend into lists of lists - - automatic unicode normalization of input data - - controlling the case-sensitivity (see :ref:`case_sort`) - - sorting file paths correctly (see :ref:`path_sort`) - - allow custom sorting keys (see :ref:`custom_sort`) - -FAQ ---- - -How do I debug :func:`~natsorted`? - The best way to debug :func:`~natsorted` is to generate a key using :func:`~natsort_keygen` - with the same options being passed to :func:`~natsorted`. One can take a look at - exactly what is being done with their input using this key - it is highly recommended - to `look at this issue describing how to debug `_ - for *how* to debug, and also to review the - `How Does Natsort Work? `_ - page for *why* :mod:`natsort` is doing that to your data. 
- - If you are trying to sort custom classes and running into trouble, please take a look at - https://github.com/SethMMorton/natsort/issues/60. In short, - custom classes are not likely to be sorted correctly if one relies - on the behavior of ``__lt__`` and the other rich comparison operators in their - custom class - it is better to use a ``key`` function with :mod:`natsort`, or - use the :mod:`natsort` key as part of your rich comparison operator definition. - -How *does* :mod:`natsort` work? - If you don't want to read `How Does Natsort Work? `_, - here is a quick primer. - - :mod:`natsort` provides a `key function `_ - that can be passed to `list.sort() `_ - or `sorted() `_ in order to - modify the default sorting behavior. This key is generated on-demand with the - key generator :func:`natsort.natsort_keygen`. :func:`natsort.natsorted` is essentially - a wrapper for the following code: - - .. code-block:: python - - >>> from natsort import natsort_keygen - >>> natsort_key = natsort_keygen() - >>> sorted(['1', '10', '2'], key=natsort_key) - ['1', '2', '10'] - - Users can further customize :mod:`natsort` sorting behavior with the ``key`` - and/or ``alg`` options (see details in the `Further Customizing Natsort`_ - section). - - The key generated by :func:`natsort.natsort_keygen` *always* returns a :class:`tuple`. It - does so in the following way (*some details omitted for clarity*): - - 1. Assume the input is a string, and attempt to split it into numbers and - non-numbers using regular expressions. Numbers are then converted into - either :class:`int` or :class:`float`. - 2. If the above fails because the input is not a string, assume the input - is some other sequence (e.g. :class:`list` or :class:`tuple`), and recursively - apply the key to each element of the sequence. - 3. If the above fails because the input is not iterable, assume the input - is an :class:`int` or :class:`float`, and just return the input in a :class:`tuple`. 
- - Because a :class:`tuple` is always returned, a :exc:`TypeError` should not be common - unless one tries to do something odd like sort an :class:`int` against a :class:`list`. - -:mod:`natsort` gave me results I didn't expect, and it's a terrible library! - Did you try to debug using the above advice? If so, and you still cannot figure out - the error, then please `file an issue `_. - -Shell script ------------- - -:mod:`natsort` comes with a shell script called :mod:`natsort`, or can also be called -from the command line with ``python -m natsort``. - -Requirements ------------- - -:mod:`natsort` requires Python version 2.6 or greater or Python 3.3 or greater. -It may run on (but is not tested against) Python 3.2. - -Optional Dependencies ---------------------- - -fastnumbers -+++++++++++ - -The most efficient sorting can occur if you install the -`fastnumbers `_ package -(version >=2.0.0); it helps with the string to number conversions. -:mod:`natsort` will still run (efficiently) without the package, but if you need -to squeeze out that extra juice it is recommended you include this as a dependency. -:mod:`natsort` will not require (or check) that -`fastnumbers `_ is installed -at installation. - -PyICU -+++++ - -It is recommended that you install `PyICU `_ -if you wish to sort in a locale-dependent manner, see -http://natsort.readthedocs.io/en/master/locale_issues.html for an explanation why. - -Installation ------------- - -Use ``pip``! - -.. code-block:: sh - - $ pip install natsort - -If you want to install the `Optional Dependencies`_, you can use the -`"extras" notation `_ -at installation time to install those dependencies as well - use ``fast`` for -`fastnumbers `_ and ``icu`` for -`PyICU `_. - -.. code-block:: sh - - # Install both optional dependencies. 
- $ pip install natsort[fast,icu] - # Install just fastnumbers - $ pip install natsort[fast] - -How to Run Tests ----------------- - -Please note that :mod:`natsort` is NOT set-up to support ``python setup.py test``. - -The recommended way to run tests is with `tox `_. -After installing ``tox``, running tests is as simple as executing the following in the -``natsort`` directory: - -.. code-block:: sh - - $ tox - -``tox`` will create virtual a virtual environment for your tests and install all the -needed testing requirements for you. You can specify a particular python version -with the ``-e`` flag, e.g. ``tox -e py36``. - -If you do not wish to use ``tox``, you can install the testing dependencies and run the -tests manually using `pytest `_ - ``natsort`` -contains a ``Pipfile`` for use with `pipenv `_ that -makes it easy for you to install the testing dependencies: - -.. code-block:: sh - - $ pipenv install --skip-lock --dev - $ pipenv run python -m pytest - -Note that above I invoked ``python -m pytest`` instead of just ``pytest`` - this is because -`the former puts the CWD on sys.path `_. \ No newline at end of file diff --git a/docs/source/locale_issues.rst b/docs/source/locale_issues.rst deleted file mode 100644 index 48fa676..0000000 --- a/docs/source/locale_issues.rst +++ /dev/null @@ -1,96 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -.. _locale_issues: - -Possible Issues with :func:`~natsort.humansorted` or ``ns.LOCALE`` -================================================================== - -Being Locale-Aware Means Both Numbers and Non-Numbers ------------------------------------------------------ - -In addition to modifying how characters are sorted, ``ns.LOCALE`` will take into -account locale-dependent thousands separators (and locale-dependent decimal -separators if ``ns.FLOAT`` is enabled). 
This means that if you are in a -locale that uses commas as the thousands separator, a number like -``123,456`` will be interpreted as ``123456``. If this is not what you want, -you may consider using ``ns.LOCALEALPHA`` which will only enable locale-aware -sorting for non-numbers (similarly, ``ns.LOCALENUM`` enables locale-aware -sorting only for numbers). - -Regenerate Key With :func:`~natsort.natsort_keygen` After Changing Locale -------------------------------------------------------------------------- - -When :func:`~natsort.natsort_keygen` is called it returns a key function that -hard-codes the provided settings. This means that the key returned when -``ns.LOCALE`` is used contins the settings specifed by the locale -*loaded at the time the key is generated*. If you change the locale, -you should regenerate the key to account for the new locale. - -Corollary: Do Not Reuse :func:`~natsort.natsort_keygen` After Changing Locale -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -If you change locale, the old function will not work as expected. -The `locale `_ library works -with a global state. When :func:`~natsort.natsort_keygen` is called it does the -best job that it can to make the returned function as static as possible and -independent of the global state, but the -`strxfrm `_ -function must access this global state to work; therefore, if you change -locale and use ``ns.LOCALE`` then you should discard the old key. - -.. note:: If you use `PyICU `_ then you - may be able to reuse keys after changing locale. - -The `locale `_ Module From the StdLib Has Issues -------------------------------------------------------------------------------------------------- - -:mod:`natsort` will use `PyICU `_ for -:func:`~natsort.humansorted` or ``ns.LOCALE`` if it is installed. If not, -it will fall back on the `locale `_ -library from the Python stdlib. 
If you do not have -`PyICU `_ installed, please keep the -following known problems and issues in mind. - -.. note:: Remember, if you have `PyICU `_ - installed you shouldn't need to worry about any of these. - -Explicitly Set the Locale Before Using :func:`~natsort.humansorted` or ``ns.LOCALE`` -++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -I have found that unless you explicitly set a locale, the sorted order may not -be what you expect. Setting this is straightforward -(in the below example I use 'en_US.UTF-8', but you should use your -locale):: - - >>> import locale - >>> locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') - 'en_US.UTF-8' - -.. _bug_note: - -The `locale `_ Module Is Broken on Mac OS X -++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -It's not Python's fault, but the OS... the locale library for BSD-based systems -(of which Mac OS X is one) is broken. See the following links: - - - http://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help - - http://bugs.python.org/issue23195 - - https://github.com/SethMMorton/natsort/issues/21 (contains instructons on installing) - - http://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm - - https://github.com/SethMMorton/natsort/issues/34 - -Of course, installing `PyICU `_ fixes this, -but if you don't want to or cannot install this there is some hope. - - 1. As of ``natsort`` version 4.0.0, ``natsort`` is configured - to compensate for a broken ``locale`` library. When sorting non-numbers - it will handle case as you expect, but it will still not be able to - comprehend non-ASCII characters properly. Additionally, it has - a built-in lookup table of thousands separators that are incorrect - on OS X/BSD (but is possible it is not complete... please file an - issue if you see it is not complete) - 2. Use "\*.ISO8859-1" locale (i.e. 
'en_US.ISO8859-1') rather than "\*.UTF-8" - locale. I have found that these have fewer issues than "UTF-8", but - your mileage may vary. diff --git a/docs/source/natsort_key.rst b/docs/source/natsort_key.rst deleted file mode 100644 index 351b351..0000000 --- a/docs/source/natsort_key.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:func:`~natsort.natsort_key` -============================ - -.. autofunction:: natsort_key - diff --git a/docs/source/natsort_keygen.rst b/docs/source/natsort_keygen.rst deleted file mode 100644 index b0d5988..0000000 --- a/docs/source/natsort_keygen.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:func:`~natsort.natsort_keygen` -=============================== - -.. autofunction:: natsort_keygen - diff --git a/docs/source/natsorted.rst b/docs/source/natsorted.rst deleted file mode 100644 index 30b5692..0000000 --- a/docs/source/natsorted.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:func:`~natsort.natsorted` -========================== - -.. autofunction:: natsorted - diff --git a/docs/source/ns_class.rst b/docs/source/ns_class.rst deleted file mode 100644 index f604e3d..0000000 --- a/docs/source/ns_class.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:class:`~natsort.ns` -==================== - -.. autoclass:: ns - diff --git a/docs/source/order_by_index.rst b/docs/source/order_by_index.rst deleted file mode 100644 index b1d7681..0000000 --- a/docs/source/order_by_index.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:func:`~natsort.order_by_index` -=============================== - -.. autofunction:: order_by_index - diff --git a/docs/source/realsorted.rst b/docs/source/realsorted.rst deleted file mode 100644 index 5af5b12..0000000 --- a/docs/source/realsorted.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. 
currentmodule:: natsort - -:func:`~natsort.realsorted` -=========================== - -.. autofunction:: realsorted - diff --git a/docs/source/shell.rst b/docs/source/shell.rst deleted file mode 100644 index 953c423..0000000 --- a/docs/source/shell.rst +++ /dev/null @@ -1,147 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -.. _shell: - -Shell Script -============ - -The ``natsort`` shell script is automatically installed when you install -:mod:`natsort` with pip. - -Below is the usage and some usage examples for the ``natsort`` shell script. - -Usage ------ - -:: - - usage: natsort [-h] [--version] [-p] [-f LOW HIGH] [-F LOW HIGH] [-e EXCLUDE] - [-r] [-t {digit,int,float,version,ver}] [--nosign] [--noexp] - [--locale] - [entries [entries ...]] - - Performs a natural sort on entries given on the command-line. - A natural sort sorts numerically then alphabetically, and will sort - by numbers in the middle of an entry. - - positional arguments: - entries The entries to sort. Taken from stdin if nothing is - given on the command line. - - optional arguments: - -h, --help show this help message and exit - --version show program's version number and exit - -p, --paths Interpret the input as file paths. This is not - strictly necessary to sort all file paths, but in - cases where there are OS-generated file paths like - "Folder/" and "Folder (1)/", this option is needed to - make the paths sorted in the order you expect - ("Folder/" before "Folder (1)/"). - -f LOW HIGH, --filter LOW HIGH - Used for keeping only the entries that have a number - falling in the given range. - -F LOW HIGH, --reverse-filter LOW HIGH - Used for excluding the entries that have a number - falling in the given range. - -e EXCLUDE, --exclude EXCLUDE - Used to exclude an entry that contains a specific - number. - -r, --reverse Returns in reversed order. 
- -t {digit,int,float,version,ver,real,f,i,r,d}, - --number-type {digit,int,float,version,ver,real,f,i,r,d}, - --number_type {digit,int,float,version,ver,real,f,i,r,d} - Choose the type of number to search for. "float" will - search for floating-point numbers. "int" will only - search for integers. "digit", "version", and "ver" are - synonyms for "int"."real" is a shortcut for "float" - with --sign. "i" and "d" are synonyms for "int", "f" - is a synonym for "float", and "r" is a synonym for - "real".The default is int. - --nosign Do not consider "+" or "-" as part of a number, i.e. - do not take sign into consideration. This is the - default. - -s, --sign Consider "+" or "-" as part of a number, i.e. take - sign into consideration. The default is unsigned. - --noexp Do not consider an exponential as part of a number, - i.e. 1e4, would be considered as 1, "e", and 4, not as - 10000. This only effects the --number-type=float. - -l, --locale Causes natsort to use locale-aware sorting. You will - get the best results if you install PyICU. - -Description ------------ - -``natsort`` was originally written to aid in computational chemistry -research so that it would be easy to analyze large sets of output files -named after the parameter used:: - - $ ls *.out - mode1000.35.out mode1243.34.out mode744.43.out mode943.54.out - -(Obviously, in reality there would be more files, but you get the idea.) Notice -that the shell sorts in lexicographical order. This is the behavior of programs like -``find`` as well as ``ls``. The problem is passing these files to an -analysis program causes them not to appear in numerical order, which can lead -to bad analysis. To remedy this, use ``natsort``:: - - $ natsort *.out - mode744.43.out - mode943.54.out - mode1000.35.out - mode1243.34.out - $ natsort -t r *.out | xargs your_program - -``-t r`` is short for ``--number-type real``. You can also place natsort in -the middle of a pipe:: - - $ find . 
-name "*.out" | natsort -t r | xargs your_program - -To sort version numbers, use the default ``--number-type``:: - - $ ls * - prog-1.10.zip prog-1.9.zip prog-2.0.zip - $ natsort * - prog-1.9.zip - prog-1.10.zip - prog-2.0.zip - -In general, all ``natsort`` shell script options mirror the :func:`~natsorted` API, -with notable exception of the ``--filter``, ``--reverse-filter``, and ``--exclude`` -options. These three options are used as follows:: - - $ ls *.out - mode1000.35.out mode1243.34.out mode744.43.out mode943.54.out - $ natsort -t r *.out -f 900 1100 # Select only numbers between 900-1100 - mode943.54.out - mode1000.35.out - $ natsort -t r *.out -F 900 1100 # Select only numbers NOT between 900-1100 - mode744.43.out - mode1243.34.out - $ natsort -t r *.out -e 1000.35 # Exclude 1000.35 from search - mode744.43.out - mode943.54.out - mode1243.34.out - -If you are sorting paths with OS-generated filenames, you may require the -``--paths``/``-p`` option:: - - $ find . ! -path . -type f - ./folder/file (1).txt - ./folder/file.txt - ./folder (1)/file.txt - ./folder (10)/file.txt - ./folder (2)/file.txt - $ find . ! -path . -type f | natsort - ./folder (1)/file.txt - ./folder (2)/file.txt - ./folder (10)/file.txt - ./folder/file (1).txt - ./folder/file.txt - $ find . ! -path . -type f | natsort -p - ./folder/file.txt - ./folder/file (1).txt - ./folder (1)/file.txt - ./folder (2)/file.txt - ./folder (10)/file.txt - diff --git a/docs/source/special_cases_everywhere.jpg b/docs/source/special_cases_everywhere.jpg deleted file mode 100644 index a9e0e86..0000000 Binary files a/docs/source/special_cases_everywhere.jpg and /dev/null differ diff --git a/docs/source/versorted.rst b/docs/source/versorted.rst deleted file mode 100644 index 6f88597..0000000 --- a/docs/source/versorted.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. default-domain:: py -.. currentmodule:: natsort - -:func:`~natsort.versorted` -========================== - -.. 
autofunction:: versorted - diff --git a/docs/special_cases_everywhere.jpg b/docs/special_cases_everywhere.jpg new file mode 100644 index 0000000..a9e0e86 Binary files /dev/null and b/docs/special_cases_everywhere.jpg differ diff --git a/natsort/__init__.py b/natsort/__init__.py index 679fa90..da23650 100644 --- a/natsort/__init__.py +++ b/natsort/__init__.py @@ -11,31 +11,27 @@ index_humansorted, index_natsorted, index_realsorted, - index_versorted, natsort_key, natsort_keygen, natsorted, ns, order_by_index, realsorted, - versorted, ) from natsort.utils import chain_functions if float(sys.version[:3]) < 3: from natsort.natsort import natcmp -__version__ = "5.4.1" +__version__ = "6.0.0" __all__ = [ "natsort_key", "natsort_keygen", "natsorted", - "versorted", "humansorted", "realsorted", "index_natsorted", - "index_versorted", "index_humansorted", "index_realsorted", "order_by_index", @@ -48,5 +44,4 @@ ] # Add the ns keys to this namespace for convenience. -# A dict comprehension is not used for Python 2.6 compatibility. -globals().update(dict((k, getattr(ns, k)) for k in dir(ns) if k.isupper())) +globals().update(ns._asdict()) diff --git a/natsort/__main__.py b/natsort/__main__.py index c06c9b8..b52bf36 100644 --- a/natsort/__main__.py +++ b/natsort/__main__.py @@ -24,7 +24,7 @@ parser.add_argument( "--version", action="version", - version="%(prog)s {0}".format(natsort.__version__), + version="%(prog)s {}".format(natsort.__version__), ) parser.add_argument( "-p", @@ -78,13 +78,12 @@ "--number-type", "--number_type", dest="number_type", - choices=("digit", "int", "float", "version", "ver", "real", "f", "i", "r", "d"), + choices=("int", "float", "real", "f", "i", "r"), default="int", help='Choose the type of number to search for. "float" will search ' 'for floating-point numbers. "int" will only search for ' - 'integers. "digit", "version", and "ver" are synonyms for "int".' - '"real" is a shortcut for "float" with --sign. 
' - '"i" and "d" are synonyms for "int", "f" is a synonym for ' + 'integers. "real" is a shortcut for "float" with --sign. ' + '"i" is a synonym for "int", "f" is a synonym for ' '"float", and "r" is a synonym for "real".' "The default is %(default)s.", ) diff --git a/natsort/compat/locale.py b/natsort/compat/locale.py index 1629ba0..41abea6 100644 --- a/natsort/compat/locale.py +++ b/natsort/compat/locale.py @@ -7,9 +7,10 @@ # Std. lib imports. import sys +from functools import cmp_to_key # Local imports. -from natsort.compat.py23 import PY_VERSION, cmp_to_key, py23_unichr +from natsort.compat.py23 import PY_VERSION, py23_unichr # This string should be sorted after any other byte string because # it contains the max unicode character repeated 20 times. diff --git a/natsort/compat/py23.py b/natsort/compat/py23.py index ba9abd9..58f7487 100644 --- a/natsort/compat/py23.py +++ b/natsort/compat/py23.py @@ -56,43 +56,6 @@ py23_map = itertools.imap py23_filter = itertools.ifilter -# cmp_to_key was not created till 2.7, so require this for 2.6 -try: - from functools import cmp_to_key -except ImportError: # pragma: no cover - - def cmp_to_key(mycmp): - """Convert a cmp= function into a key= function""" - - class K(object): - __slots__ = ["obj"] - - def __init__(self, obj): - self.obj = obj - - def __lt__(self, other): - return mycmp(self.obj, other.obj) < 0 - - def __gt__(self, other): - return mycmp(self.obj, other.obj) > 0 - - def __eq__(self, other): - return mycmp(self.obj, other.obj) == 0 - - def __le__(self, other): - return mycmp(self.obj, other.obj) <= 0 - - def __ge__(self, other): - return mycmp(self.obj, other.obj) >= 0 - - def __ne__(self, other): - return mycmp(self.obj, other.obj) != 0 - - def __hash__(self): - raise TypeError("hash not implemented") - - return K - # This function is intended to decorate other functions that will modify # either a string directly, or a function's docstring. 
diff --git a/natsort/natsort.py b/natsort/natsort.py index 27f532d..e597815 100644 --- a/natsort/natsort.py +++ b/natsort/natsort.py @@ -14,7 +14,7 @@ import natsort.compat.locale from natsort import utils from natsort.compat.py23 import py23_cmp, py23_str, u_format -from natsort.ns_enum import ns, ns_DUMB +from natsort.ns_enum import NS_DUMB, ns @u_format @@ -108,7 +108,7 @@ @u_format -def natsort_keygen(key=None, alg=ns.DEFAULT, **_kwargs): +def natsort_keygen(key=None, alg=ns.DEFAULT): """ Generate a key to sort strings and numbers naturally. @@ -154,16 +154,15 @@ [{u}'num-3', {u}'num2', {u}'num5.10', {u}'num5.3'] """ - # Transform old arguments to the ns enum. try: - alg = utils.args_to_enum(**_kwargs) | alg + ns.DEFAULT | alg except TypeError: msg = "natsort_keygen: 'alg' argument must be from the enum 'ns'" - raise ValueError(msg + ", got {0}".format(py23_str(alg))) - - # Add the _DUMB option if the locale library is broken. + raise ValueError(msg + ", got {}".format(py23_str(alg))) + + # Add the NS_DUMB option if the locale library is broken. if alg & ns.LOCALEALPHA and natsort.compat.locale.dumb_sort(): - alg |= ns_DUMB + alg |= NS_DUMB # Set some variables that will be passed to the factory functions if alg & ns.NUMAFTER: @@ -220,7 +219,7 @@ @u_format -def natsorted(seq, key=None, reverse=False, alg=ns.DEFAULT, **_kwargs): +def natsorted(seq, key=None, reverse=False, alg=ns.DEFAULT): """ Sorts an iterable naturally. @@ -264,24 +263,8 @@ [{u}'num2', {u}'num3', {u}'num5'] """ - key = natsort_keygen(key, alg, **_kwargs) + key = natsort_keygen(key, alg) return sorted(seq, reverse=reverse, key=key) - - -@u_format -def versorted(seq, key=None, reverse=False, alg=ns.DEFAULT, **_kwargs): - """ - Identical to :func:`natsorted`. - - This function exists for backwards compatibility with `natsort` - version < 4.0.0. Future development should use :func:`natsorted`. 
- - See Also - -------- - natsorted - - """ - return natsorted(seq, key, reverse, alg, **_kwargs) @u_format @@ -392,7 +375,7 @@ @u_format -def index_natsorted(seq, key=None, reverse=False, alg=ns.DEFAULT, **_kwargs): +def index_natsorted(seq, key=None, reverse=False, alg=ns.DEFAULT): """ Determine the list of the indexes used to sort the input sequence. @@ -457,27 +440,8 @@ # Pair the index and sequence together, then sort by element index_seq_pair = [[x, y] for x, y in enumerate(seq)] - index_seq_pair.sort(reverse=reverse, key=natsort_keygen(newkey, alg, **_kwargs)) + index_seq_pair.sort(reverse=reverse, key=natsort_keygen(newkey, alg)) return [x for x, _ in index_seq_pair] - - -@u_format -def index_versorted(seq, key=None, reverse=False, alg=ns.DEFAULT, **_kwargs): - """ - Identical to :func:`index_natsorted`. - - This function exists for backwards compatibility with - ``index_natsort`` version < 4.0.0. Future development should use - :func:`index_natsorted`. - - Please see the :func:`index_natsorted` documentation for use. - - See Also - -------- - index_natsorted - - """ - return index_natsorted(seq, key, reverse, alg, **_kwargs) @u_format @@ -678,16 +642,16 @@ cached_keys = {} - def __new__(cls, x, y, alg=ns.DEFAULT, *args, **kwargs): + def __new__(cls, x, y, alg=ns.DEFAULT): try: - alg = utils.args_to_enum(**kwargs) | alg + ns.DEFAULT | alg except TypeError: - msg = "natsort_keygen: 'alg' argument must be " "from the enum 'ns'" - raise ValueError(msg + ", got {0}".format(py23_str(alg))) + msg = "natsort_keygen: 'alg' argument must be from the enum 'ns'" + raise ValueError(msg + ", got {}".format(py23_str(alg))) # Add the _DUMB option if the locale library is broken. 
if alg & ns.LOCALEALPHA and natsort.compat.locale.dumb_sort(): - alg |= ns_DUMB + alg |= NS_DUMB if alg not in cls.cached_keys: cls.cached_keys[alg] = natsort_keygen(alg=alg) diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py index be61295..6eebef6 100644 --- a/natsort/ns_enum.py +++ b/natsort/ns_enum.py @@ -6,8 +6,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals import collections - -# NOTE: OrderedDict is not used below for compatibility with Python 2.6. # The below are the base ns options. The values will be stored as powers # of two so bitmasks can be used to extract the user's requested options. @@ -28,17 +26,14 @@ ] # Following were previously options but are now defaults. -enum_do_nothing = ["DEFAULT", "TYPESAFE", "INT", "VERSION", "DIGIT", "UNSIGNED"] +enum_do_nothing = ["DEFAULT", "INT", "UNSIGNED"] # The following are bitwise-OR combinations of other fields. enum_combos = [("REAL", ("FLOAT", "SIGNED")), ("LOCALE", ("LOCALEALPHA", "LOCALENUM"))] # The following are aliases for other fields. enum_aliases = [ - ("T", "TYPESAFE"), ("I", "INT"), - ("V", "VERSION"), - ("D", "DIGIT"), ("U", "UNSIGNED"), ("F", "FLOAT"), ("S", "SIGNED"), @@ -60,26 +55,25 @@ ] # Construct the list of bitwise distinct enums with their fields. 
-enum_fields = [(name, 1 << i) for i, name in enumerate(enum_options)] -enum_fields.extend((name, 0) for name in enum_do_nothing) +enum_fields = collections.OrderedDict( + (name, 1 << i) for i, name in enumerate(enum_options) +) +enum_fields.update((name, 0) for name in enum_do_nothing) for name, combo in enum_combos: - current_mapping = dict(enum_fields) - combined_value = current_mapping[combo[0]] + combined_value = enum_fields[combo[0]] for combo_name in combo[1:]: - combined_value |= current_mapping[combo_name] - enum_fields.append((name, combined_value)) + combined_value |= enum_fields[combo_name] + enum_fields[name] = combined_value -current_mapping = dict(enum_fields) -enum_fields.extend((alias, current_mapping[name]) for alias, name in enum_aliases) - -# Finally, extract out the enum field names and their values. -enum_field_names, enum_field_values = zip(*enum_fields) +enum_fields.update( + (alias, enum_fields[name]) for alias, name in enum_aliases +) # Subclass the namedtuple to improve the docstring. # noinspection PyUnresolvedReferences -class _NSEnum(collections.namedtuple("_NSEnum", enum_field_names)): +class _NSEnum(collections.namedtuple("_NSEnum", enum_fields.keys())): """ Enum to control the `natsort` algorithm. @@ -130,7 +124,7 @@ default "NFD". This will transform characters such as '⑦' into '7'. Please see https://stackoverflow.com/a/7934397/1399279, https://stackoverflow.com/a/7931547/1399279, - and http://unicode.org/reports/tr15/ for full details into unicode + and https://unicode.org/reports/tr15/ for full details into unicode normalization. LOCALE, L Tell `natsort` to be locale-aware when sorting. This includes both @@ -180,14 +174,6 @@ If an NaN shows up in the input, this instructs `natsort` to treat these as +Infinity and place them after all the other numbers. By default, an NaN be treated as -Infinity and be placed first. - TYPESAFE, T - Deprecated as of `natsort` version 5.0.0; this option is now - a no-op because it is always true. 
- VERSION, V - Deprecated as of `natsort` version 5.0.0; this option is now - a no-op because it is the default. - DIGIT, D - Same as `VERSION` above. Notes ----- @@ -205,7 +191,7 @@ # Here is where the instance of the ns enum that will be exported is created. # It is a poor-man's singleton. -ns = _NSEnum(*enum_field_values) +ns = _NSEnum(*enum_fields.values()) # The below is private for internal use only. -ns_DUMB = 1 << 31 +NS_DUMB = 1 << 31 diff --git a/natsort/unicode_numeric_hex.py b/natsort/unicode_numeric_hex.py index 56c69d6..de28f3b 100644 --- a/natsort/unicode_numeric_hex.py +++ b/natsort/unicode_numeric_hex.py @@ -1743,7 +1743,7 @@ a = py23_unichr(i) except ValueError: break - if a in set("0123456789"): + if a in "0123456789": continue if unicodedata.numeric(a, None) is not None: hex_chars.append(i) diff --git a/natsort/utils.py b/natsort/utils.py index 496904a..4db0f80 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -50,7 +50,6 @@ from os.path import split as path_split from os.path import splitext as path_splitext from unicodedata import normalize -from warnings import warn from natsort.compat.fastnumbers import fast_float, fast_int from natsort.compat.locale import get_decimal_point, get_strxfrm, get_thousands_sep @@ -63,7 +62,7 @@ py23_str, u_format, ) -from natsort.ns_enum import ns, ns_DUMB +from natsort.ns_enum import NS_DUMB, ns from natsort.unicode_numbers import digits_no_decimals, numeric_no_decimals if PY_VERSION >= 3: @@ -379,7 +378,7 @@ """ # Sometimes we store the "original" input before transformation, # sometimes after. - orig_after_xfrm = not (alg & ns_DUMB and alg & ns.LOCALEALPHA) + orig_after_xfrm = not (alg & NS_DUMB and alg & ns.LOCALEALPHA) original_func = input_transform if orig_after_xfrm else _no_op normalize_input = _normalize_input_factory(alg) @@ -492,7 +491,7 @@ """ # Shortcuts. lowfirst = alg & ns.LOWERCASEFIRST - dumb = alg & ns_DUMB + dumb = alg & NS_DUMB # Build the chain of functions to execute in order. 
function_chain = [] @@ -566,7 +565,7 @@ """ # Shortcuts. use_locale = alg & ns.LOCALEALPHA - dumb = alg & ns_DUMB + dumb = alg & NS_DUMB group_letters = (alg & ns.GROUPLETTERS) or (use_locale and dumb) nan_val = float("+inf") if alg & ns.NANLAST else float("-inf") @@ -614,7 +613,7 @@ """ if alg & ns.UNGROUPLETTERS and alg & ns.LOCALEALPHA: - swap = alg & ns_DUMB and alg & ns.LOWERCASEFIRST + swap = alg & NS_DUMB and alg & ns.LOWERCASEFIRST transform = methodcaller("swapcase") if swap else _no_op def func(split_val, val, _transform=transform, _sep=sep, _pre_sep=pre_sep): @@ -787,39 +786,3 @@ # Return the split parent paths and then the split basename. return ichain(path_parts, base_parts) - - -def args_to_enum(**kwargs): - """ - A function to convert input booleans to an enum-type argument. - - For internal use only - will be deprecated in a future release. - """ - alg = 0 - keys = ("number_type", "signed", "exp", "as_path", "py3_safe") - if any(x not in keys for x in kwargs): - x = set(kwargs) - set(keys) - raise TypeError("Invalid argument(s): " + ", ".join(x)) - if "number_type" in kwargs and kwargs["number_type"] is not int: - msg = "The 'number_type' argument is deprecated as of 3.5.0, " - msg += "please use 'alg=ns.FLOAT', 'alg=ns.INT', or 'alg=ns.VERSION'" - warn(msg, DeprecationWarning) - alg |= ns.FLOAT * bool(kwargs["number_type"] is float) - alg |= ns.INT * bool(kwargs["number_type"] in (int, None)) - alg |= ns.SIGNED * (kwargs["number_type"] not in (float, None)) - if "signed" in kwargs and kwargs["signed"] is not None: - msg = "The 'signed' argument is deprecated as of 3.5.0, " - msg += "please use 'alg=ns.SIGNED'." - warn(msg, DeprecationWarning) - alg |= ns.SIGNED * bool(kwargs["signed"]) - if "exp" in kwargs and kwargs["exp"] is not None: - msg = "The 'exp' argument is deprecated as of 3.5.0, " - msg += "please use 'alg=ns.NOEXP'." 
- warn(msg, DeprecationWarning) - alg |= ns.NOEXP * (not kwargs["exp"]) - if "as_path" in kwargs and kwargs["as_path"] is not None: - msg = "The 'as_path' argument is deprecated as of 3.5.0, " - msg += "please use 'alg=ns.PATH'." - warn(msg, DeprecationWarning) - alg |= ns.PATH * kwargs["as_path"] - return alg diff --git a/setup.cfg b/setup.cfg index 784a9a9..23a410f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 5.4.1 +current_version = 6.0.0 commit = True tag = True tag_name = {new_version} @@ -10,7 +10,9 @@ url = https://github.com/SethMMorton/natsort description = Simple yet flexible natural sorting in Python. long_description = file: README.rst +long_description_content_type = text/x-rst license = MIT +license_file = LICENSE classifiers = Development Status :: 5 - Production/Stable Intended Audience :: Developers @@ -21,8 +23,8 @@ Operating System :: OS Independent License :: OSI Approved :: MIT License Natural Language :: English + Programming Language :: Python Programming Language :: Python :: 2 - Programming Language :: Python :: 2.6 Programming Language :: Python :: 2.7 Programming Language :: Python :: 3 Programming Language :: Python :: 3.4 @@ -43,9 +45,9 @@ [bumpversion:file:natsort/__init__.py] -[bumpversion:file:docs/source/conf.py] +[bumpversion:file:docs/conf.py] -[bumpversion:file:docs/source/changelog.rst] +[bumpversion:file:CHANGELOG.rst] search = XX-XX-XXXX v. X.X.X replace = {now:%%m-%%d-%%Y} v. 
{new_version} diff --git a/setup.py b/setup.py index bef9357..68e06fc 100644 --- a/setup.py +++ b/setup.py @@ -3,12 +3,12 @@ from setuptools import find_packages, setup setup( name='natsort', - version='5.4.1', + version='6.0.0', packages=find_packages(), - install_requires=["argparse; python_version < '2.7'"], entry_points={'console_scripts': ['natsort = natsort.__main__:main']}, + python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*", extras_require={ - 'fast': ["fastnumbers >= 2.0.0; python_version > '2.6'"], + 'fast': ["fastnumbers >= 2.0.0"], 'icu': ["PyICU >= 1.0.0"] } ) diff --git a/test_natsort/conftest.py b/test_natsort/conftest.py deleted file mode 100644 index 79a8aaa..0000000 --- a/test_natsort/conftest.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Fixtures for pytest. -""" - -import locale - -import pytest - - -def load_locale(x): - """Convenience to load a locale, trying ISO8859-1 first.""" - try: - locale.setlocale(locale.LC_ALL, str("{0}.ISO8859-1".format(x))) - except locale.Error: - locale.setlocale(locale.LC_ALL, str("{0}.UTF-8".format(x))) - - -@pytest.fixture() -def with_locale_en_us(): - """Convenience to load the en_US locale - reset when complete.""" - orig = locale.getlocale() - yield load_locale("en_US") - locale.setlocale(locale.LC_ALL, orig) - - -@pytest.fixture() -def with_locale_de_de(): - """ - Convenience to load the de_DE locale - reset when complete - skip if missing. - """ - orig = locale.getlocale() - try: - load_locale("de_DE") - except locale.Error: - pytest.skip("requires de_DE locale to be installed") - else: - yield - finally: - locale.setlocale(locale.LC_ALL, orig) diff --git a/test_natsort/profile_natsorted.py b/test_natsort/profile_natsorted.py deleted file mode 100644 index ec7037f..0000000 --- a/test_natsort/profile_natsorted.py +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: utf-8 -*- -"""\ -This file contains functions to profile natsorted with different -inputs and different settings. 
-""" -from __future__ import print_function - -import cProfile -import locale -import sys - -try: - from natsort import ns, natsort_keygen - from natsort.compat.py23 import py23_range -except ImportError: - sys.path.insert(0, ".") - from natsort import ns, natsort_keygen - from natsort.compat.py23 import py23_range - -locale.setlocale(locale.LC_ALL, "en_US.UTF-8") - -# Samples to parse -number = 14695498 -int_string = "43493" -float_string = "-434.93e7" -plain_string = "hello world" -fancy_string = "7abba9342fdab" -a_path = "/p/Folder (1)/file (1).tar.gz" -some_bytes = b"these are bytes" -a_list = ["hello", "goodbye", "74"] - -basic_key = natsort_keygen() -real_key = natsort_keygen(alg=ns.REAL) -path_key = natsort_keygen(alg=ns.PATH) -locale_key = natsort_keygen(alg=ns.LOCALE) - - -def prof_time_to_generate(): - print("*** Generate Plain Key ***") - for _ in py23_range(100000): - natsort_keygen() - - -cProfile.run("prof_time_to_generate()", sort="time") - - -def prof_parsing(a, msg, key=basic_key): - print(msg) - for _ in py23_range(100000): - key(a) - - -cProfile.run( - 'prof_parsing(int_string, "*** Basic Call, Int as String ***")', sort="time" -) -cProfile.run( - 'prof_parsing(float_string, "*** Basic Call, Float as String ***")', sort="time" -) -cProfile.run('prof_parsing(float_string, "*** Real Call ***", real_key)', sort="time") -cProfile.run('prof_parsing(number, "*** Basic Call, Number ***")', sort="time") -cProfile.run( - 'prof_parsing(fancy_string, "*** Basic Call, Mixed String ***")', sort="time" -) -cProfile.run('prof_parsing(some_bytes, "*** Basic Call, Byte String ***")', sort="time") -cProfile.run('prof_parsing(a_path, "*** Path Call ***", path_key)', sort="time") -cProfile.run('prof_parsing(a_list, "*** Basic Call, Recursive ***")', sort="time") -cProfile.run( - 'prof_parsing("434,930,000 dollars", "*** Locale Call ***", locale_key)', - sort="time", -) diff --git a/test_natsort/test_fake_fastnumbers.py b/test_natsort/test_fake_fastnumbers.py deleted 
file mode 100644 index 1c0da66..0000000 --- a/test_natsort/test_fake_fastnumbers.py +++ /dev/null @@ -1,138 +0,0 @@ -# -*- coding: utf-8 -*- -"""\ -Test the fake fastnumbers module. -""" -from __future__ import unicode_literals - -import unicodedata -from math import isnan - -from hypothesis import given -from hypothesis.strategies import floats, integers, text -from natsort.compat.fake_fastnumbers import fast_float, fast_int -from natsort.compat.py23 import PY_VERSION - -if PY_VERSION >= 3: - long = int - - -def is_float(x): - try: - float(x) - except ValueError: - try: - unicodedata.numeric(x) - except (ValueError, TypeError): - return False - else: - return True - else: - return True - - -def not_a_float(x): - return not is_float(x) - - -def is_int(x): - try: - return x.is_integer() - except AttributeError: - try: - long(x) - except ValueError: - try: - unicodedata.digit(x) - except (ValueError, TypeError): - return False - else: - return True - else: - return True - - -def not_an_int(x): - return not is_int(x) - - -# Each test has an "example" version for demonstrative purposes, -# and a test that uses the hypothesis module. 
- - -def test_fast_float_returns_nan_alternate_if_nan_option_is_given(): - assert fast_float("nan", nan=7) == 7 - - -def test_fast_float_converts_float_string_to_float_example(): - assert fast_float("45.8") == 45.8 - assert fast_float("-45") == -45.0 - assert fast_float("45.8e-2", key=len) == 45.8e-2 - assert isnan(fast_float("nan")) - assert isnan(fast_float("+nan")) - assert isnan(fast_float("-NaN")) - assert fast_float("۱۲.۱۲") == 12.12 - assert fast_float("-۱۲.۱۲") == -12.12 - - -@given(floats(allow_nan=False)) -def test_fast_float_converts_float_string_to_float(x): - assert fast_float(repr(x)) == x - - -def test_fast_float_leaves_string_as_is_example(): - assert fast_float("invalid") == "invalid" - - -@given(text().filter(not_a_float).filter(bool)) -def test_fast_float_leaves_string_as_is(x): - assert fast_float(x) == x - - -def test_fast_float_with_key_applies_to_string_example(): - assert fast_float("invalid", key=len) == len("invalid") - - -@given(text().filter(not_a_float).filter(bool)) -def test_fast_float_with_key_applies_to_string(x): - assert fast_float(x, key=len) == len(x) - - -def test_fast_int_leaves_float_string_as_is_example(): - assert fast_int("45.8") == "45.8" - assert fast_int("nan") == "nan" - assert fast_int("inf") == "inf" - - -@given(floats().filter(not_an_int)) -def test_fast_int_leaves_float_string_as_is(x): - assert fast_int(repr(x)) == repr(x) - - -def test_fast_int_converts_int_string_to_int_example(): - assert fast_int("-45") == -45 - assert fast_int("+45") == 45 - assert fast_int("۱۲") == 12 - assert fast_int("-۱۲") == -12 - - -@given(integers()) -def test_fast_int_converts_int_string_to_int(x): - assert fast_int(repr(x)) == x - - -def test_fast_int_leaves_string_as_is_example(): - assert fast_int("invalid") == "invalid" - - -@given(text().filter(not_an_int).filter(bool)) -def test_fast_int_leaves_string_as_is(x): - assert fast_int(x) == x - - -def test_fast_int_with_key_applies_to_string_example(): - assert fast_int("invalid", 
key=len) == len("invalid") - - -@given(text().filter(not_an_int).filter(bool)) -def test_fast_int_with_key_applies_to_string(x): - assert fast_int(x, key=len) == len(x) diff --git a/test_natsort/test_final_data_transform_factory.py b/test_natsort/test_final_data_transform_factory.py deleted file mode 100644 index 11b361b..0000000 --- a/test_natsort/test_final_data_transform_factory.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- -"""These test the utils.py functions.""" -from __future__ import unicode_literals - -import pytest -from hypothesis import example, given -from hypothesis.strategies import floats, integers, text -from natsort.compat.py23 import py23_str -from natsort.ns_enum import ns, ns_DUMB -from natsort.utils import final_data_transform_factory - - -@pytest.mark.parametrize("alg", [ns.DEFAULT, ns.UNGROUPLETTERS, ns.LOCALE]) -@given(x=text(), y=floats(allow_nan=False, allow_infinity=False) | integers()) -@pytest.mark.usefixtures("with_locale_en_us") -def test_final_data_transform_factory_default(x, y, alg): - final_data_transform_func = final_data_transform_factory(alg, "", "::") - value = (x, y) - original_value = "".join(map(py23_str, value)) - result = final_data_transform_func(value, original_value) - assert result == value - - -@pytest.mark.parametrize( - "alg, func", - [ - (ns.UNGROUPLETTERS | ns.LOCALE, lambda x: x), - (ns.LOCALE | ns.UNGROUPLETTERS | ns_DUMB, lambda x: x), - (ns.LOCALE | ns.UNGROUPLETTERS | ns.LOWERCASEFIRST, lambda x: x), - ( - ns.LOCALE | ns.UNGROUPLETTERS | ns_DUMB | ns.LOWERCASEFIRST, - lambda x: x.swapcase(), - ), - ], -) -@given(x=text(), y=floats(allow_nan=False, allow_infinity=False) | integers()) -@example(x="İ", y=0) -@pytest.mark.usefixtures("with_locale_en_us") -def test_final_data_transform_factory_ungroup_and_locale(x, y, alg, func): - final_data_transform_func = final_data_transform_factory(alg, "", "::") - value = (x, y) - original_value = "".join(map(py23_str, value)) - result = 
final_data_transform_func(value, original_value) - if x: - expected = ((func(original_value[:1]),), value) - else: - expected = (("::",), value) - assert result == expected - - -def test_final_data_transform_factory_ungroup_and_locale_empty_tuple(): - final_data_transform_func = final_data_transform_factory(ns.UG | ns.L, "", "::") - assert final_data_transform_func((), "") == ((), ()) diff --git a/test_natsort/test_input_string_transform_factory.py b/test_natsort/test_input_string_transform_factory.py deleted file mode 100644 index 3d000bb..0000000 --- a/test_natsort/test_input_string_transform_factory.py +++ /dev/null @@ -1,105 +0,0 @@ -# -*- coding: utf-8 -*- -"""These test the utils.py functions.""" -from __future__ import unicode_literals - -import pytest -from hypothesis import example, given -from hypothesis.strategies import integers, text -from natsort.compat.py23 import NEWPY -from natsort.ns_enum import ns, ns_DUMB -from natsort.utils import input_string_transform_factory - - -def lower(x): - """Call the appropriate lower method for the Python version.""" - if NEWPY: - return x.casefold() - else: - return x.lower() - - -def thousands_separated_int(n): - """Insert thousands separators in an int.""" - new_int = "" - for i, y in enumerate(reversed(n), 1): - new_int = y + new_int - # For every third digit, insert a thousands separator. 
- if i % 3 == 0 and i != len(n): - new_int = "," + new_int - return new_int - - -@given(text()) -def test_input_string_transform_factory_is_no_op_for_no_alg_options(x): - input_string_transform_func = input_string_transform_factory(ns.DEFAULT) - assert input_string_transform_func(x) is x - - -@pytest.mark.parametrize( - "alg, example_func", - [ - (ns.IGNORECASE, lower), - (ns_DUMB, lambda x: x.swapcase()), - (ns.LOWERCASEFIRST, lambda x: x.swapcase()), - (ns_DUMB | ns.LOWERCASEFIRST, lambda x: x), # No-op - (ns.IGNORECASE | ns.LOWERCASEFIRST, lambda x: lower(x.swapcase())), - ], -) -@given(x=text()) -def test_input_string_transform_factory(x, alg, example_func): - input_string_transform_func = input_string_transform_factory(alg) - assert input_string_transform_func(x) == example_func(x) - - -@example(12543642642534980) # 12,543,642,642,534,980 => 12543642642534980 -@given(x=integers(min_value=1000)) -@pytest.mark.usefixtures("with_locale_en_us") -def test_input_string_transform_factory_cleans_thousands(x): - int_str = str(x).rstrip("lL") - thousands_int_str = thousands_separated_int(int_str) - assert thousands_int_str.replace(",", "") != thousands_int_str - - input_string_transform_func = input_string_transform_factory(ns.LOCALE) - assert input_string_transform_func(thousands_int_str) == int_str - - # Using LOCALEALPHA does not affect numbers. - input_string_transform_func_no_op = input_string_transform_factory(ns.LOCALEALPHA) - assert input_string_transform_func_no_op(thousands_int_str) == thousands_int_str - - -# These might be too much to test with hypothesis. 
- - -@pytest.mark.parametrize( - "x, expected", - [ - ("12,543,642642.5345,34980", "12543,642642.5345,34980"), - ("12,59443,642,642.53,4534980", "12,59443,642642.53,4534980"), # No change - ("12543,642,642.5,34534980", "12543,642642.5,34534980"), - ], -) -@pytest.mark.usefixtures("with_locale_en_us") -def test_input_string_transform_factory_handles_us_locale(x, expected): - input_string_transform_func = input_string_transform_factory(ns.LOCALE) - assert input_string_transform_func(x) == expected - - -@pytest.mark.parametrize( - "alg, expected", - [ - (ns.LOCALE, "1543,753"), # Does nothing without FLOAT - (ns.LOCALE | ns.FLOAT, "1543.753"), - (ns.LOCALEALPHA, "1543,753"), # LOCALEALPHA won't do anything, need LOCALENUM - ], -) -@pytest.mark.usefixtures("with_locale_de_de") -def test_input_string_transform_factory_handles_german_locale(alg, expected): - input_string_transform_func = input_string_transform_factory(alg) - assert input_string_transform_func("1543,753") == expected - - -@pytest.mark.usefixtures("with_locale_de_de") -def test_input_string_transform_factory_does_nothing_with_non_num_input(): - input_string_transform_func = input_string_transform_factory(ns.LOCALE | ns.FLOAT) - expected = "154s,t53" - assert input_string_transform_func("154s,t53") == expected diff --git a/test_natsort/test_main.py b/test_natsort/test_main.py deleted file mode 100644 index 4559b92..0000000 --- a/test_natsort/test_main.py +++ /dev/null @@ -1,223 +0,0 @@ -# -*- coding: utf-8 -*- -"""\ -Test the natsort command-line tool functions. 
-""" -from __future__ import print_function, unicode_literals - -import re -import sys - -import pytest -from hypothesis import given -from hypothesis.strategies import data, floats, integers, lists -from natsort.__main__ import ( - check_filters, - keep_entry_range, - keep_entry_value, - main, - range_check, - sort_and_print_entries, -) - - -def test_main_passes_default_arguments_with_no_command_line_options(mocker): - p = mocker.patch("natsort.__main__.sort_and_print_entries") - main("num-2", "num-6", "num-1") - args = p.call_args[0][1] - assert not args.paths - assert args.filter is None - assert args.reverse_filter is None - assert args.exclude is None - assert not args.reverse - assert args.number_type == "int" - assert not args.signed - assert args.exp - assert not args.locale - - -def test_main_passes_arguments_with_all_command_line_options(mocker): - arguments = ["--paths", "--reverse", "--locale"] - arguments.extend(["--filter", "4", "10"]) - arguments.extend(["--reverse-filter", "100", "110"]) - arguments.extend(["--number-type", "float"]) - arguments.extend(["--noexp", "--sign"]) - arguments.extend(["--exclude", "34"]) - arguments.extend(["--exclude", "35"]) - arguments.extend(["num-2", "num-6", "num-1"]) - p = mocker.patch("natsort.__main__.sort_and_print_entries") - main(*arguments) - args = p.call_args[0][1] - assert args.paths - assert args.filter == [(4.0, 10.0)] - assert args.reverse_filter == [(100.0, 110.0)] - assert args.exclude == [34, 35] - assert args.reverse - assert args.number_type == "float" - assert args.signed - assert not args.exp - assert args.locale - - -class Args: - """A dummy class to simulate the argparse Namespace object""" - - def __init__(self, filt, reverse_filter, exclude, as_path, reverse): - self.filter = filt - self.reverse_filter = reverse_filter - self.exclude = exclude - self.reverse = reverse - self.number_type = "float" - self.signed = True - self.exp = True - self.paths = as_path - self.locale = 0 - - -mock_print = 
"__builtin__.print" if sys.version[0] == "2" else "builtins.print" - -entries = [ - "tmp/a57/path2", - "tmp/a23/path1", - "tmp/a1/path1", - "tmp/a1 (1)/path1", - "tmp/a130/path1", - "tmp/a64/path1", - "tmp/a64/path2", -] - - -@pytest.mark.parametrize( - "options, order", - [ - # Defaults, all options false - # tmp/a1 (1)/path1 - # tmp/a1/path1 - # tmp/a23/path1 - # tmp/a57/path2 - # tmp/a64/path1 - # tmp/a64/path2 - # tmp/a130/path1 - ([None, None, False, False, False], [3, 2, 1, 0, 5, 6, 4]), - # Path option True - # tmp/a1/path1 - # tmp/a1 (1)/path1 - # tmp/a23/path1 - # tmp/a57/path2 - # tmp/a64/path1 - # tmp/a64/path2 - # tmp/a130/path1 - ([None, None, False, True, False], [2, 3, 1, 0, 5, 6, 4]), - # Filter option keeps only within range - # tmp/a23/path1 - # tmp/a57/path2 - # tmp/a64/path1 - # tmp/a64/path2 - ([[(20, 100)], None, False, False, False], [1, 0, 5, 6]), - # Reverse filter, exclude in range - # tmp/a1/path1 - # tmp/a1 (1)/path1 - # tmp/a130/path1 - ([None, [(20, 100)], False, True, False], [2, 3, 4]), - # Exclude given values with exclude list - # tmp/a1/path1 - # tmp/a1 (1)/path1 - # tmp/a57/path2 - # tmp/a64/path1 - # tmp/a64/path2 - ([None, None, [23, 130], True, False], [2, 3, 0, 5, 6]), - # Reverse order - # tmp/a130/path1 - # tmp/a64/path2 - # tmp/a64/path1 - # tmp/a57/path2 - # tmp/a23/path1 - # tmp/a1 (1)/path1 - # tmp/a1/path1 - ([None, None, False, True, True], reversed([2, 3, 1, 0, 5, 6, 4])), - ], -) -def test_sort_and_print_entries(options, order, mocker): - p = mocker.patch(mock_print) - sort_and_print_entries(entries, Args(*options)) - e = [mocker.call(entries[i]) for i in order] - p.assert_has_calls(e) - - -# Each test has an "example" version for demonstrative purposes, -# and a test that uses the hypothesis module. 
- - -def test_range_check_returns_range_as_is_but_with_floats_example(): - assert range_check(10, 11) == (10.0, 11.0) - assert range_check(6.4, 30) == (6.4, 30.0) - - -@given(x=floats(allow_nan=False, min_value=-1E8, max_value=1E8) | integers(), d=data()) -def test_range_check_returns_range_as_is_if_first_is_less_than_second(x, d): - # Pull data such that the first is less than the second. - if isinstance(x, float): - y = d.draw(floats(min_value=x + 1.0, max_value=1E9, allow_nan=False)) - else: - y = d.draw(integers(min_value=x + 1)) - assert range_check(x, y) == (x, y) - - -def test_range_check_raises_value_error_if_second_is_less_than_first_example(): - with pytest.raises(ValueError, match="low >= high"): - range_check(7, 2) - - -@given(x=floats(allow_nan=False), d=data()) -def test_range_check_raises_value_error_if_second_is_less_than_first(x, d): - # Pull data such that the first is greater than or equal to the second. - y = d.draw(floats(max_value=x, allow_nan=False)) - with pytest.raises(ValueError, match="low >= high"): - range_check(x, y) - - -def test_check_filters_returns_none_if_filter_evaluates_to_false(): - assert check_filters(()) is None - assert check_filters(False) is None - assert check_filters(None) is None - - -def test_check_filters_returns_input_as_is_if_filter_is_valid_example(): - assert check_filters([(6, 7)]) == [(6, 7)] - assert check_filters([(6, 7), (2, 8)]) == [(6, 7), (2, 8)] - - -@given(x=lists(integers(), min_size=1), d=data()) -def test_check_filters_returns_input_as_is_if_filter_is_valid(x, d): - # ensure y is element-wise greater than x - y = [d.draw(integers(min_value=val + 1)) for val in x] - assert check_filters(list(zip(x, y))) == [(i, j) for i, j in zip(x, y)] - - -def test_check_filters_raises_value_error_if_filter_is_invalid_example(): - with pytest.raises(ValueError, match="Error in --filter: low >= high"): - check_filters([(7, 2)]) - - -@given(x=lists(integers(), min_size=1), d=data()) -def 
test_check_filters_raises_value_error_if_filter_is_invalid(x, d): - # ensure y is element-wise less than or equal to x - y = [d.draw(integers(max_value=val)) for val in x] - with pytest.raises(ValueError, match="Error in --filter: low >= high"): - check_filters(list(zip(x, y))) - - -@pytest.mark.parametrize( - "lows, highs, truth", - # 1. Any portion is between the bounds => True. - # 2. Any portion is between any bounds => True. - # 3. No portion is between the bounds => False. - [([0], [100], True), ([1, 88], [20, 90], True), ([1], [20], False)], -) -def test_keep_entry_range(lows, highs, truth): - assert keep_entry_range("a56b23c89", lows, highs, int, re.compile(r"\d+")) is truth - - -# 1. Values not in entry => True. 2. Values in entry => False. -@pytest.mark.parametrize("values, truth", [([100, 45], True), ([23], False)]) -def test_keep_entry_value(values, truth): - assert keep_entry_value("a56b23c89", values, int, re.compile(r"\d+")) is truth diff --git a/test_natsort/test_natsort_cmp.py b/test_natsort/test_natsort_cmp.py deleted file mode 100644 index 41a252f..0000000 --- a/test_natsort/test_natsort_cmp.py +++ /dev/null @@ -1,83 +0,0 @@ -# -*- coding: utf-8 -*- -# pylint: disable=unused-variable -"""These test the natcmp() function. - -Note that these tests are only relevant for Python version < 3. 
-""" -from functools import partial - -import pytest -from hypothesis import given -from hypothesis.strategies import floats, integers, lists -from natsort import ns -from natsort.compat.py23 import PY_VERSION, py23_cmp - -if PY_VERSION < 3: - from natsort import natcmp - - -class Comparable(object): - """Stub class for testing natcmp functionality.""" - - def __init__(self, value): - self.value = value - - def __cmp__(self, other): - return natcmp(self.value, other.value) - - -@pytest.mark.skipif(PY_VERSION >= 3.0, reason="cmp() deprecated in Python 3") -class TestNatCmp: - - def test_classes_can_be_compared(self): - one = Comparable("1") - two = Comparable("2") - another_two = Comparable("2") - ten = Comparable("10") - assert ten > two == another_two > one - - def test_keys_are_being_cached(self, mocker): - natcmp.cached_keys = {} - assert len(natcmp.cached_keys) == 0 - natcmp(0, 0) - assert len(natcmp.cached_keys) == 1 - natcmp(0, 0) - assert len(natcmp.cached_keys) == 1 - - with mocker.patch("natsort.compat.locale.dumb_sort", return_value=False): - natcmp(0, 0, alg=ns.L) - assert len(natcmp.cached_keys) == 2 - natcmp(0, 0, alg=ns.L) - assert len(natcmp.cached_keys) == 2 - - with mocker.patch("natsort.compat.locale.dumb_sort", return_value=True): - natcmp(0, 0, alg=ns.L) - assert len(natcmp.cached_keys) == 3 - natcmp(0, 0, alg=ns.L) - assert len(natcmp.cached_keys) == 3 - - def test_illegal_algorithm_raises_error(self): - with pytest.raises(ValueError): - natcmp(0, 0, alg="Just random stuff") - - def test_classes_can_utilize_max_or_min(self): - comparables = [Comparable(i) for i in range(10)] - - assert max(comparables) == comparables[-1] - assert min(comparables) == comparables[0] - - @given(integers(), integers()) - def test_natcmp_works_the_same_for_integers_as_cmp(self, x, y): - assert py23_cmp(x, y) == natcmp(x, y) - - @given(floats(allow_nan=False), floats(allow_nan=False)) - def test_natcmp_works_the_same_for_floats_as_cmp(self, x, y): - assert 
py23_cmp(x, y) == natcmp(x, y) - - @given(lists(elements=integers())) - def test_sort_strings_with_numbers(self, a_list): - strings = [str(var) for var in a_list] - # noinspection PyArgumentList - natcmp_sorted = sorted(strings, cmp=partial(natcmp, alg=ns.SIGNED)) - - assert sorted(a_list) == [int(var) for var in natcmp_sorted] diff --git a/test_natsort/test_natsort_key.py b/test_natsort/test_natsort_key.py deleted file mode 100644 index e0c442e..0000000 --- a/test_natsort/test_natsort_key.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: utf-8 -*- -"""These test the utils.py functions.""" -from __future__ import unicode_literals - -import pytest -from hypothesis import given -from hypothesis.strategies import binary, floats, integers, lists, text -from natsort.compat.py23 import PY_VERSION, py23_str -from natsort.utils import natsort_key - -if PY_VERSION >= 3: - long = int - - -def str_func(x): - if isinstance(x, py23_str): - return x - else: - raise TypeError("Not a str!") - - -def fail(_): - raise AssertionError("This should never be reached!") - - -@given(floats(allow_nan=False) | integers()) -def test_natsort_key_with_numeric_input_takes_number_path(x): - assert natsort_key(x, None, str_func, fail, lambda y: y) is x - - -@pytest.mark.skipif(PY_VERSION < 3, reason="only valid on python3") -@given(binary().filter(bool)) -def test_natsort_key_with_bytes_input_takes_bytes_path(x): - assert natsort_key(x, None, str_func, lambda y: y, fail) is x - - -@given(text()) -def test_natsort_key_with_text_input_takes_string_path(x): - assert natsort_key(x, None, str_func, fail, fail) is x - - -@given(lists(elements=text(), min_size=1, max_size=10)) -def test_natsort_key_with_nested_input_takes_nested_path(x): - assert natsort_key(x, None, str_func, fail, fail) == tuple(x) - - -@given(text()) -def test_natsort_key_with_key_argument_applies_key_before_processing(x): - assert natsort_key(x, len, str_func, fail, lambda y: y) == len(x) diff --git 
a/test_natsort/test_natsort_keygen.py b/test_natsort/test_natsort_keygen.py deleted file mode 100644 index baa4c3e..0000000 --- a/test_natsort/test_natsort_keygen.py +++ /dev/null @@ -1,168 +0,0 @@ -# -*- coding: utf-8 -*- -"""\ -Here are a collection of examples of how this module can be used. -See the README or the natsort homepage for more details. -""" -from __future__ import print_function, unicode_literals - -import pytest -from natsort import natsort_key, natsort_keygen, natsorted, ns -from natsort.compat.locale import get_strxfrm, null_string_locale -from natsort.compat.py23 import PY_VERSION - - -@pytest.fixture -def arbitrary_input(): - return ["6A-5.034e+1", "/Folder (1)/Foo", 56.7] - - -@pytest.fixture -def bytes_input(): - return b"6A-5.034e+1" - - -def test_natsort_keygen_demonstration(): - original_list = ["a50", "a51.", "a50.31", "a50.4", "a5.034e1", "a50.300"] - copy_of_list = original_list[:] - original_list.sort(key=natsort_keygen(alg=ns.F)) - # natsorted uses the output of natsort_keygen under the hood. 
- assert original_list == natsorted(copy_of_list, alg=ns.F) - - -def test_natsort_key_public(): - assert natsort_key("a-5.034e2") == ("a-", 5, ".", 34, "e", 2) - - -def test_natsort_keygen_with_invalid_alg_input_raises_value_error(): - # Invalid arguments give the correct response - with pytest.raises(ValueError, match="'alg' argument"): - natsort_keygen(None, "1") - - -@pytest.mark.parametrize( - "alg, expected", - [(ns.DEFAULT, ("a-", 5, ".", 34, "e", 1)), (ns.FLOAT | ns.SIGNED, ("a", -50.34))], -) -def test_natsort_keygen_returns_natsort_key_that_parses_input(alg, expected): - ns_key = natsort_keygen(alg=alg) - assert ns_key("a-5.034e1") == expected - - -@pytest.mark.parametrize( - "alg, expected", - [ - ( - ns.DEFAULT, - (("", 6, "A-", 5, ".", 34, "e+", 1), ("/Folder (", 1, ")/Foo"), ("", 56.7)), - ), - ( - ns.IGNORECASE, - (("", 6, "a-", 5, ".", 34, "e+", 1), ("/folder (", 1, ")/foo"), ("", 56.7)), - ), - (ns.REAL, (("", 6.0, "A", -50.34), ("/Folder (", 1.0, ")/Foo"), ("", 56.7))), - ( - ns.LOWERCASEFIRST | ns.FLOAT | ns.NOEXP, - ( - ("", 6.0, "a-", 5.034, "E+", 1.0), - ("/fOLDER (", 1.0, ")/fOO"), - ("", 56.7), - ), - ), - ( - ns.PATH | ns.GROUPLETTERS, - ( - (("", 6, "aA--", 5, "..", 34, "ee++", 1),), - (("//",), ("fFoollddeerr ((", 1, "))"), ("fFoooo",)), - (("", 56.7),), - ), - ), - ], -) -def test_natsort_keygen_handles_arbitrary_input(arbitrary_input, alg, expected): - ns_key = natsort_keygen(alg=alg) - assert ns_key(arbitrary_input) == expected - - -@pytest.mark.parametrize( - "alg, expected", - [ - (ns.DEFAULT, (b"6A-5.034e+1",)), - (ns.IGNORECASE, (b"6a-5.034e+1",)), - (ns.REAL, (b"6A-5.034e+1",)), - (ns.LOWERCASEFIRST | ns.FLOAT | ns.NOEXP, (b"6A-5.034e+1",)), - (ns.PATH | ns.GROUPLETTERS, ((b"6A-5.034e+1",),)), - ], -) -@pytest.mark.skipif(PY_VERSION < 3.0, reason="special bytes handling only on Python3") -def test_natsort_keygen_handles_bytes_input(bytes_input, alg, expected): - ns_key = natsort_keygen(alg=alg) - assert ns_key(bytes_input) == 
expected - - -@pytest.mark.parametrize( - "alg, expected, is_dumb", - [ - ( - ns.LOCALE, - ( - (null_string_locale, 6, "A-", 5, ".", 34, "e+", 1), - ("/Folder (", 1, ")/Foo"), - (null_string_locale, 56.7), - ), - False, - ), - ( - ns.LOCALE, - ( - (null_string_locale, 6, "aa--", 5, "..", 34, "eE++", 1), - ("//ffoOlLdDeErR ((", 1, "))//ffoOoO"), - (null_string_locale, 56.7), - ), - True, - ), - ( - ns.LOCALE | ns.CAPITALFIRST, - ( - (("",), (null_string_locale, 6, "A-", 5, ".", 34, "e+", 1)), - (("/",), ("/Folder (", 1, ")/Foo")), - (("",), (null_string_locale, 56.7)), - ), - False, - ), - ], -) -@pytest.mark.usefixtures("with_locale_en_us") -def test_natsort_keygen_with_locale(mocker, arbitrary_input, alg, expected, is_dumb): - # First, apply the correct strxfrm function to the string values. - strxfrm = get_strxfrm() - expected = [list(sub) for sub in expected] - try: - for i in (2, 4, 6): - expected[0][i] = strxfrm(expected[0][i]) - for i in (0, 2): - expected[1][i] = strxfrm(expected[1][i]) - expected = tuple(tuple(sub) for sub in expected) - except IndexError: # ns.LOCALE | ns.CAPITALFIRST - expected = [[list(subsub) for subsub in sub] for sub in expected] - for i in (2, 4, 6): - expected[0][1][i] = strxfrm(expected[0][1][i]) - for i in (0, 2): - expected[1][1][i] = strxfrm(expected[1][1][i]) - expected = tuple(tuple(tuple(subsub) for subsub in sub) for sub in expected) - - with mocker.patch("natsort.compat.locale.dumb_sort", return_value=is_dumb): - ns_key = natsort_keygen(alg=alg) - assert ns_key(arbitrary_input) == expected - - -@pytest.mark.parametrize( - "alg, is_dumb", - [(ns.LOCALE, False), (ns.LOCALE, True), (ns.LOCALE | ns.CAPITALFIRST, False)], -) -@pytest.mark.skipif(PY_VERSION < 3.0, reason="special bytes handling only on Python3") -@pytest.mark.usefixtures("with_locale_en_us") -def test_natsort_keygen_with_locale_bytes(mocker, bytes_input, alg, is_dumb): - expected = (b"6A-5.034e+1",) - with mocker.patch("natsort.compat.locale.dumb_sort", 
return_value=is_dumb): - ns_key = natsort_keygen(alg=alg) - assert ns_key(bytes_input) == expected diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py deleted file mode 100644 index 5a80772..0000000 --- a/test_natsort/test_natsorted.py +++ /dev/null @@ -1,299 +0,0 @@ -# -*- coding: utf-8 -*- -"""\ -Here are a collection of examples of how this module can be used. -See the README or the natsort homepage for more details. -""" -from __future__ import print_function, unicode_literals - -from operator import itemgetter - -import pytest -from natsort import as_utf8, natsorted, ns -from natsort.compat.py23 import PY_VERSION -from pytest import raises - - -@pytest.fixture -def float_list(): - return ["a50", "a51.", "a50.31", "a-50", "a50.4", "a5.034e1", "a50.300"] - - -@pytest.fixture -def fruit_list(): - return ["Apple", "corn", "Corn", "Banana", "apple", "banana"] - - -@pytest.fixture -def mixed_list(): - return ["Ä", "0", "ä", 3, "b", 1.5, "2", "Z"] - - -def test_natsorted_numbers_in_ascending_order(): - given = ["a2", "a5", "a9", "a1", "a4", "a10", "a6"] - expected = ["a1", "a2", "a4", "a5", "a6", "a9", "a10"] - assert natsorted(given) == expected - - -def test_natsorted_can_sort_as_signed_floats_with_exponents(float_list): - expected = ["a-50", "a50", "a50.300", "a50.31", "a5.034e1", "a50.4", "a51."] - assert natsorted(float_list, alg=ns.REAL) == expected - - -@pytest.mark.parametrize( - # UNSIGNED is default - "alg", - [ns.NOEXP | ns.FLOAT | ns.UNSIGNED, ns.NOEXP | ns.FLOAT], -) -def test_natsorted_can_sort_as_unsigned_and_ignore_exponents(float_list, alg): - expected = ["a5.034e1", "a50", "a50.300", "a50.31", "a50.4", "a51.", "a-50"] - assert natsorted(float_list, alg=alg) == expected - - -# INT, DIGIT, and VERSION are all equivalent. 
-@pytest.mark.parametrize("alg", [ns.DEFAULT, ns.INT, ns.DIGIT, ns.VERSION]) -def test_natsorted_can_sort_as_unsigned_ints_which_is_default(float_list, alg): - expected = ["a5.034e1", "a50", "a50.4", "a50.31", "a50.300", "a51.", "a-50"] - assert natsorted(float_list, alg=alg) == expected - - -def test_natsorted_can_sort_as_signed_ints(float_list): - expected = ["a-50", "a5.034e1", "a50", "a50.4", "a50.31", "a50.300", "a51."] - assert natsorted(float_list, alg=ns.SIGNED) == expected - - -@pytest.mark.parametrize( - "alg, expected", - [(ns.UNSIGNED, ["a7", "a+2", "a-5"]), (ns.SIGNED, ["a-5", "a+2", "a7"])], -) -def test_natsorted_can_sort_with_or_without_accounting_for_sign(alg, expected): - given = ["a-5", "a7", "a+2"] - assert natsorted(given, alg=alg) == expected - - -@pytest.mark.parametrize("alg", [ns.DEFAULT, ns.VERSION]) -def test_natsorted_can_sort_as_version_numbers(alg): - given = ["1.9.9a", "1.11", "1.9.9b", "1.11.4", "1.10.1"] - expected = ["1.9.9a", "1.9.9b", "1.10.1", "1.11", "1.11.4"] - assert natsorted(given, alg=alg) == expected - - -@pytest.mark.parametrize( - "alg, expected", - [ - (ns.DEFAULT, ["0", 1.5, "2", 3, "Ä", "Z", "ä", "b"]), - (ns.NUMAFTER, ["Ä", "Z", "ä", "b", "0", 1.5, "2", 3]), - ], -) -def test_natsorted_handles_mixed_types(mixed_list, alg, expected): - assert natsorted(mixed_list, alg=alg) == expected - - -@pytest.mark.parametrize( - "alg, expected, slc", - [ - (ns.DEFAULT, [float("nan"), 5, "25", 1E40], slice(1, None)), - (ns.NANLAST, [5, "25", 1E40, float("nan")], slice(None, 3)), - ], -) -def test_natsorted_handles_nan(alg, expected, slc): - given = ["25", 5, float("nan"), 1E40] - # The slice is because NaN != NaN - # noinspection PyUnresolvedReferences - assert natsorted(given, alg=alg)[slc] == expected[slc] - - -@pytest.mark.skipif(PY_VERSION < 3.0, reason="error is only raised on Python 3") -def test_natsorted_with_mixed_bytes_and_str_input_raises_type_error(): - with raises(TypeError, match="bytes"): - natsorted(["ä", b"b"]) - 
- # ...unless you use as_utf (or some other decoder). - assert natsorted(["ä", b"b"], key=as_utf8) == ["ä", b"b"] - - -def test_natsorted_raises_type_error_for_non_iterable_input(): - with raises(TypeError, match="'int' object is not iterable"): - natsorted(100) - - -def test_natsorted_recurses_into_nested_lists(): - given = [["a1", "a5"], ["a1", "a40"], ["a10", "a1"], ["a2", "a5"]] - expected = [["a1", "a5"], ["a1", "a40"], ["a2", "a5"], ["a10", "a1"]] - assert natsorted(given) == expected - - -def test_natsorted_applies_key_to_each_list_element_before_sorting_list(): - given = [("a", "num3"), ("b", "num5"), ("c", "num2")] - expected = [("c", "num2"), ("a", "num3"), ("b", "num5")] - assert natsorted(given, key=itemgetter(1)) == expected - - -def test_natsorted_returns_list_in_reversed_order_with_reverse_option(float_list): - expected = natsorted(float_list)[::-1] - assert natsorted(float_list, reverse=True) == expected - - -def test_natsorted_handles_filesystem_paths(): - given = [ - "/p/Folder (10)/file.tar.gz", - "/p/Folder/file.tar.gz", - "/p/Folder (1)/file (1).tar.gz", - "/p/Folder (1)/file.tar.gz", - ] - expected_correct = [ - "/p/Folder/file.tar.gz", - "/p/Folder (1)/file.tar.gz", - "/p/Folder (1)/file (1).tar.gz", - "/p/Folder (10)/file.tar.gz", - ] - expected_incorrect = [ - "/p/Folder (1)/file (1).tar.gz", - "/p/Folder (1)/file.tar.gz", - "/p/Folder (10)/file.tar.gz", - "/p/Folder/file.tar.gz", - ] - # Is incorrect by default. - assert natsorted(given) == expected_incorrect - # Need ns.PATH to make it correct. 
- assert natsorted(given, alg=ns.PATH) == expected_correct - - -def test_natsorted_handles_numbers_and_filesystem_paths_simultaneously(): - # You can sort paths and numbers, not that you'd want to - given = ["/Folder (9)/file.exe", 43] - expected = [43, "/Folder (9)/file.exe"] - assert natsorted(given, alg=ns.PATH) == expected - - -@pytest.mark.parametrize( - "alg, expected", - [ - (ns.DEFAULT, ["Apple", "Banana", "Corn", "apple", "banana", "corn"]), - (ns.IGNORECASE, ["Apple", "apple", "Banana", "banana", "corn", "Corn"]), - (ns.LOWERCASEFIRST, ["apple", "banana", "corn", "Apple", "Banana", "Corn"]), - (ns.GROUPLETTERS, ["Apple", "apple", "Banana", "banana", "Corn", "corn"]), - (ns.G | ns.LF, ["apple", "Apple", "banana", "Banana", "corn", "Corn"]), - ], -) -def test_natsorted_supports_case_handling(alg, expected, fruit_list): - assert natsorted(fruit_list, alg=alg) == expected - - -@pytest.mark.parametrize( - "alg, expected", - [ - (ns.DEFAULT, [("A5", "a6"), ("a3", "a1")]), - (ns.LOWERCASEFIRST, [("a3", "a1"), ("A5", "a6")]), - (ns.IGNORECASE, [("a3", "a1"), ("A5", "a6")]), - ], -) -def test_natsorted_supports_nested_case_handling(alg, expected): - given = [("A5", "a6"), ("a3", "a1")] - assert natsorted(given, alg=alg) == expected - - -@pytest.mark.parametrize( - "alg, expected", - [ - (ns.DEFAULT, ["apple", "Apple", "banana", "Banana", "corn", "Corn"]), - (ns.CAPITALFIRST, ["Apple", "Banana", "Corn", "apple", "banana", "corn"]), - (ns.LOWERCASEFIRST, ["Apple", "apple", "Banana", "banana", "Corn", "corn"]), - (ns.C | ns.LF, ["apple", "banana", "corn", "Apple", "Banana", "Corn"]), - ], -) -@pytest.mark.usefixtures("with_locale_en_us") -def test_natsorted_can_sort_using_locale(fruit_list, alg, expected): - assert natsorted(fruit_list, alg=ns.LOCALE | alg) == expected - - -@pytest.mark.usefixtures("with_locale_en_us") -def test_natsorted_can_sort_locale_specific_numbers_en(): - given = ["c", "a5,467.86", "ä", "b", "a5367.86", "a5,6", "a5,50"] - expected = ["a5,6", 
"a5,50", "a5367.86", "a5,467.86", "ä", "b", "c"] - assert natsorted(given, alg=ns.LOCALE | ns.F) == expected - - -@pytest.mark.usefixtures("with_locale_de_de") -def test_natsorted_can_sort_locale_specific_numbers_de(): - given = ["c", "a5.467,86", "ä", "b", "a5367.86", "a5,6", "a5,50"] - expected = ["a5,50", "a5,6", "a5367.86", "a5.467,86", "ä", "b", "c"] - assert natsorted(given, alg=ns.LOCALE | ns.F) == expected - - -@pytest.mark.parametrize( - "alg, expected", - [ - (ns.DEFAULT, ["0", 1.5, "2", 3, "ä", "Ä", "b", "Z"]), - (ns.NUMAFTER, ["ä", "Ä", "b", "Z", "0", 1.5, "2", 3]), - (ns.UNGROUPLETTERS, ["0", 1.5, "2", 3, "Ä", "Z", "ä", "b"]), - (ns.UG | ns.NA, ["Ä", "Z", "ä", "b", "0", 1.5, "2", 3]), - # Adding PATH changes nothing. - (ns.PATH, ["0", 1.5, "2", 3, "ä", "Ä", "b", "Z"]), - (ns.PATH | ns.NUMAFTER, ["ä", "Ä", "b", "Z", "0", 1.5, "2", 3]), - (ns.PATH | ns.UNGROUPLETTERS, ["0", 1.5, "2", 3, "Ä", "Z", "ä", "b"]), - (ns.PATH | ns.UG | ns.NA, ["Ä", "Z", "ä", "b", "0", 1.5, "2", 3]), - ], -) -@pytest.mark.usefixtures("with_locale_en_us") -def test_natsorted_handles_mixed_types_with_locale(mixed_list, alg, expected): - assert natsorted(mixed_list, alg=ns.LOCALE | alg) == expected - - -@pytest.mark.parametrize( - "alg, expected", - [ - (ns.DEFAULT, ["73", "5039", "Banana", "apple", "corn", "~~~~~~"]), - (ns.NUMAFTER, ["Banana", "apple", "corn", "~~~~~~", "73", "5039"]), - ], -) -def test_natsorted_sorts_an_odd_collection_of_strings(alg, expected): - given = ["apple", "Banana", "73", "5039", "corn", "~~~~~~"] - assert natsorted(given, alg=alg) == expected - - -def test_natsorted_sorts_mixed_ascii_and_non_ascii_numbers(): - given = [ - "1st street", - "10th street", - "2nd street", - "2 street", - "1 street", - "1street", - "11 street", - "street 2", - "street 1", - "Street 11", - "۲ street", - "۱ street", - "۱street", - "۱۲street", - "۱۱ street", - "street ۲", - "street ۱", - "street ۱", - "street ۱۲", - "street ۱۱", - ] - expected = [ - "1 street", - "۱ street", - 
"1st street", - "1street", - "۱street", - "2 street", - "۲ street", - "2nd street", - "10th street", - "11 street", - "۱۱ street", - "۱۲street", - "street 1", - "street ۱", - "street ۱", - "street 2", - "street ۲", - "Street 11", - "street ۱۱", - "street ۱۲", - ] - assert natsorted(given, alg=ns.IGNORECASE) == expected diff --git a/test_natsort/test_natsorted_convenience.py b/test_natsort/test_natsorted_convenience.py deleted file mode 100644 index 70fcf79..0000000 --- a/test_natsort/test_natsorted_convenience.py +++ /dev/null @@ -1,129 +0,0 @@ -# -*- coding: utf-8 -*- -"""\ -Here are a collection of examples of how this module can be used. -See the README or the natsort homepage for more details. -""" -from __future__ import print_function, unicode_literals - -from operator import itemgetter - -import pytest -from natsort import ( - as_ascii, - as_utf8, - decoder, - humansorted, - index_humansorted, - index_natsorted, - index_realsorted, - index_versorted, - natsorted, - ns, - order_by_index, - realsorted, - versorted, -) -from natsort.compat.py23 import PY_VERSION - - -@pytest.fixture -def version_list(): - return ["1.9.9a", "1.11", "1.9.9b", "1.11.4", "1.10.1"] - - -@pytest.fixture -def float_list(): - return ["a50", "a51.", "a50.31", "a-50", "a50.4", "a5.034e1", "a50.300"] - - -@pytest.fixture -def fruit_list(): - return ["Apple", "corn", "Corn", "Banana", "apple", "banana"] - - -def test_decoder_returns_function_that_can_decode_bytes_but_return_non_bytes_as_is(): - func = decoder("latin1") - str_obj = "bytes" - int_obj = 14 - assert func(b"bytes") == str_obj - assert func(int_obj) is int_obj # returns as-is, same object ID - if PY_VERSION >= 3: - assert ( - func(str_obj) is str_obj - ) # same object returned on Python3 b/c only bytes has decode - else: - assert func(str_obj) is not str_obj - assert ( - func(str_obj) == str_obj - ) # not same object on Python2 because str can decode - - -def test_as_ascii_converts_bytes_to_ascii(): - assert 
decoder("ascii")(b"bytes") == as_ascii(b"bytes") - - -def test_as_utf8_converts_bytes_to_utf8(): - assert decoder("utf8")(b"bytes") == as_utf8(b"bytes") - - -def test_versorted_is_identical_to_natsorted(version_list): - # versorted is retained for backwards compatibility - assert versorted(version_list) == natsorted(version_list) - - -def test_realsorted_is_identical_to_natsorted_with_real_alg(float_list): - assert realsorted(float_list) == natsorted(float_list, alg=ns.REAL) - - -@pytest.mark.usefixtures("with_locale_en_us") -def test_humansorted_is_identical_to_natsorted_with_locale_alg(fruit_list): - assert humansorted(fruit_list) == natsorted(fruit_list, alg=ns.LOCALE) - - -def test_index_natsorted_returns_integer_list_of_sort_order_for_input_list(): - given = ["num3", "num5", "num2"] - other = ["foo", "bar", "baz"] - index = index_natsorted(given) - assert index == [2, 0, 1] - assert [given[i] for i in index] == ["num2", "num3", "num5"] - assert [other[i] for i in index] == ["baz", "foo", "bar"] - - -def test_index_natsorted_reverse(): - given = ["num3", "num5", "num2"] - assert index_natsorted(given, reverse=True) == index_natsorted(given)[::-1] - - -def test_index_natsorted_applies_key_function_before_sorting(): - given = [("a", "num3"), ("b", "num5"), ("c", "num2")] - expected = [2, 0, 1] - assert index_natsorted(given, key=itemgetter(1)) == expected - - -def test_index_versorted_is_identical_to_index_natsorted(version_list): - # index_versorted is retained for backwards compatibility - assert index_versorted(version_list) == index_natsorted(version_list) - - -def test_index_realsorted_is_identical_to_index_natsorted_with_real_alg(float_list): - assert index_realsorted(float_list) == index_natsorted(float_list, alg=ns.REAL) - - -@pytest.mark.usefixtures("with_locale_en_us") -def test_index_humansorted_is_identical_to_index_natsorted_with_locale_alg(fruit_list): - assert index_humansorted(fruit_list) == index_natsorted(fruit_list, alg=ns.LOCALE) - - -def 
test_order_by_index_sorts_list_according_to_order_of_integer_list(): - given = ["num3", "num5", "num2"] - index = [2, 0, 1] - expected = [given[i] for i in index] - assert expected == ["num2", "num3", "num5"] - assert order_by_index(given, index) == expected - - -def test_order_by_index_returns_generator_with_iter_true(): - given = ["num3", "num5", "num2"] - index = [2, 0, 1] - assert order_by_index(given, index, True) != [given[i] for i in index] - assert list(order_by_index(given, index, True)) == [given[i] for i in index] diff --git a/test_natsort/test_parse_bytes_function.py b/test_natsort/test_parse_bytes_function.py deleted file mode 100644 index 49f54ae..0000000 --- a/test_natsort/test_parse_bytes_function.py +++ /dev/null @@ -1,25 +0,0 @@ -# -*- coding: utf-8 -*- -"""These test the utils.py functions.""" -from __future__ import unicode_literals - -import pytest -from hypothesis import given -from hypothesis.strategies import binary -from natsort.ns_enum import ns -from natsort.utils import parse_bytes_factory - - -@pytest.mark.parametrize( - "alg, example_func", - [ - (ns.DEFAULT, lambda x: (x,)), - (ns.IGNORECASE, lambda x: (x.lower(),)), - # With PATH, it becomes a tested tuple. 
- (ns.PATH, lambda x: ((x,),)), - (ns.PATH | ns.IGNORECASE, lambda x: ((x.lower(),),)), - ], -) -@given(x=binary()) -def test_parse_bytest_factory_makes_function_that_returns_tuple(x, alg, example_func): - parse_bytes_func = parse_bytes_factory(alg) - assert parse_bytes_func(x) == example_func(x) diff --git a/test_natsort/test_parse_number_function.py b/test_natsort/test_parse_number_function.py deleted file mode 100644 index 7f22ef4..0000000 --- a/test_natsort/test_parse_number_function.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -"""These test the utils.py functions.""" -from __future__ import unicode_literals - -import pytest -from hypothesis import given -from hypothesis.strategies import floats, integers -from natsort.ns_enum import ns -from natsort.utils import parse_number_factory - - -@pytest.mark.usefixtures("with_locale_en_us") -@pytest.mark.parametrize( - "alg, example_func", - [ - (ns.DEFAULT, lambda x: ("", x)), - (ns.PATH, lambda x: (("", x),)), - (ns.UNGROUPLETTERS | ns.LOCALE, lambda x: (("xx",), ("", x))), - (ns.PATH | ns.UNGROUPLETTERS | ns.LOCALE, lambda x: ((("xx",), ("", x)),)), - ], -) -@given(x=floats(allow_nan=False) | integers()) -def test_parse_number_factory_makes_function_that_returns_tuple(x, alg, example_func): - parse_number_func = parse_number_factory(alg, "", "xx") - assert parse_number_func(x) == example_func(x) - - -@pytest.mark.parametrize( - "alg, x, result", - [ - (ns.DEFAULT, 57, ("", 57)), - (ns.DEFAULT, float("nan"), ("", float("-inf"))), # NaN transformed to -infinity - (ns.NANLAST, float("nan"), ("", float("+inf"))), # NANLAST makes it +infinity - ], -) -def test_parse_number_factory_treats_nan_special(alg, x, result): - parse_number_func = parse_number_factory(alg, "", "xx") - assert parse_number_func(x) == result diff --git a/test_natsort/test_parse_string_function.py b/test_natsort/test_parse_string_function.py deleted file mode 100644 index c4565be..0000000 --- 
a/test_natsort/test_parse_string_function.py +++ /dev/null @@ -1,93 +0,0 @@ -# -*- coding: utf-8 -*- -"""These test the utils.py functions.""" -from __future__ import unicode_literals - -import unicodedata - -import pytest -from hypothesis import given -from hypothesis.strategies import floats, integers, lists, text -from natsort.compat.fastnumbers import fast_float -from natsort.compat.py23 import py23_str -from natsort.ns_enum import ns, ns_DUMB -from natsort.utils import NumericalRegularExpressions as NumRegex -from natsort.utils import parse_string_factory - - -class CustomTuple(tuple): - """Used to ensure what is given during testing is what is returned.""" - - original = None - - -def input_transform(x): - """Make uppercase.""" - try: - return x.upper() - except AttributeError: - return x - - -def final_transform(x, original): - """Make the input a CustomTuple.""" - t = CustomTuple(x) - t.original = original - return t - - -@pytest.fixture -def parse_string_func(request): - """A parse_string_factory result with sample arguments.""" - sep = "" - return parse_string_factory( - request.param, # algorirhm - sep, - NumRegex.int_nosign().split, - input_transform, - fast_float, - final_transform, - ) - - -@pytest.mark.parametrize("parse_string_func", [ns.DEFAULT], indirect=True) -@given(x=floats() | integers()) -def test_parse_string_factory_raises_type_error_if_given_number(x, parse_string_func): - with pytest.raises(TypeError): - assert parse_string_func(x) - - -# noinspection PyCallingNonCallable -@pytest.mark.parametrize( - "parse_string_func, orig_func", - [ - (ns.DEFAULT, lambda x: x.upper()), - (ns.LOCALE, lambda x: x.upper()), - (ns.LOCALE | ns_DUMB, lambda x: x), # This changes the "original" handling. 
- ], - indirect=["parse_string_func"], -) -@given( - x=lists( - elements=floats(allow_nan=False) | text() | integers(), min_size=1, max_size=10 - ) -) -@pytest.mark.usefixtures("with_locale_en_us") -def test_parse_string_factory_invariance(x, parse_string_func, orig_func): - # parse_string_factory is the high-level combination of several dedicated - # functions involved in splitting and manipulating a string. The details of - # what those functions do is not relevant to testing parse_string_factory. - # What is relevant is that the form of the output matches the invariant - # that even elements are string and odd are numerical. That each component - # function is doing what it should is tested elsewhere. - value = "".join(map(py23_str, x)) # Convert the input to a single string. - result = parse_string_func(value) - result_types = list(map(type, result)) - expected_types = [py23_str if i % 2 == 0 else float for i in range(len(result))] - assert result_types == expected_types - - # The result is in our CustomTuple. - assert isinstance(result, CustomTuple) - - # Original should have gone through the "input_transform" - # which is uppercase in these tests. - assert result.original == orig_func(unicodedata.normalize("NFD", value)) diff --git a/test_natsort/test_regex.py b/test_natsort/test_regex.py deleted file mode 100644 index d3fe617..0000000 --- a/test_natsort/test_regex.py +++ /dev/null @@ -1,100 +0,0 @@ -# -*- coding: utf-8 -*- -"""These test the splitting regular expressions.""" -from __future__ import unicode_literals - -import pytest -from natsort.utils import NumericalRegularExpressions as NumRegex - - -regex_names = { - NumRegex.int_nosign(): "int_nosign", - NumRegex.int_sign(): "int_sign", - NumRegex.float_nosign_noexp(): "float_nosign_noexp", - NumRegex.float_sign_noexp(): "float_sign_noexp", - NumRegex.float_nosign_exp(): "float_nosign_exp", - NumRegex.float_sign_exp(): "float_sign_exp", -} - -# Regex Aliases (so lines stay a reasonable length. 
-i_u = NumRegex.int_nosign() -i_s = NumRegex.int_sign() -f_u = NumRegex.float_nosign_noexp() -f_s = NumRegex.float_sign_noexp() -f_ue = NumRegex.float_nosign_exp() -f_se = NumRegex.float_sign_exp() - -# Assemble a test suite of regular strings and their regular expression -# splitting result. Organize by the input string. -regex_tests = { - "-123.45e+67": { - i_u: ["-", "123", ".", "45", "e+", "67", ""], - i_s: ["", "-123", ".", "45", "e", "+67", ""], - f_u: ["-", "123.45", "e+", "67", ""], - f_s: ["", "-123.45", "e", "+67", ""], - f_ue: ["-", "123.45e+67", ""], - f_se: ["", "-123.45e+67", ""], - }, - "a-123.45e+67b": { - i_u: ["a-", "123", ".", "45", "e+", "67", "b"], - i_s: ["a", "-123", ".", "45", "e", "+67", "b"], - f_u: ["a-", "123.45", "e+", "67", "b"], - f_s: ["a", "-123.45", "e", "+67", "b"], - f_ue: ["a-", "123.45e+67", "b"], - f_se: ["a", "-123.45e+67", "b"], - }, - "hello": { - i_u: ["hello"], - i_s: ["hello"], - f_u: ["hello"], - f_s: ["hello"], - f_ue: ["hello"], - f_se: ["hello"], - }, - "abc12.34.56-7def": { - i_u: ["abc", "12", ".", "34", ".", "56", "-", "7", "def"], - i_s: ["abc", "12", ".", "34", ".", "56", "", "-7", "def"], - f_u: ["abc", "12.34", "", ".56", "-", "7", "def"], - f_s: ["abc", "12.34", "", ".56", "", "-7", "def"], - f_ue: ["abc", "12.34", "", ".56", "-", "7", "def"], - f_se: ["abc", "12.34", "", ".56", "", "-7", "def"], - }, - "a1b2c3d4e5e6": { - i_u: ["a", "1", "b", "2", "c", "3", "d", "4", "e", "5", "e", "6", ""], - i_s: ["a", "1", "b", "2", "c", "3", "d", "4", "e", "5", "e", "6", ""], - f_u: ["a", "1", "b", "2", "c", "3", "d", "4", "e", "5", "e", "6", ""], - f_s: ["a", "1", "b", "2", "c", "3", "d", "4", "e", "5", "e", "6", ""], - f_ue: ["a", "1", "b", "2", "c", "3", "d", "4e5", "e", "6", ""], - f_se: ["a", "1", "b", "2", "c", "3", "d", "4e5", "e", "6", ""], - }, - "eleven۱۱eleven11eleven১১": { # All of these are the decimal 11 - i_u: ["eleven", "۱۱", "eleven", "11", "eleven", "১১", ""], - i_s: ["eleven", "۱۱", "eleven", "11", 
"eleven", "১১", ""], - f_u: ["eleven", "۱۱", "eleven", "11", "eleven", "১১", ""], - f_s: ["eleven", "۱۱", "eleven", "11", "eleven", "১১", ""], - f_ue: ["eleven", "۱۱", "eleven", "11", "eleven", "১১", ""], - f_se: ["eleven", "۱۱", "eleven", "11", "eleven", "১১", ""], - }, - "12①②ⅠⅡ⅓": { # Two decimals, Two digits, Two numerals, fraction - i_u: ["", "12", "", "①", "", "②", "ⅠⅡ⅓"], - i_s: ["", "12", "", "①", "", "②", "ⅠⅡ⅓"], - f_u: ["", "12", "", "①", "", "②", "", "Ⅰ", "", "Ⅱ", "", "⅓", ""], - f_s: ["", "12", "", "①", "", "②", "", "Ⅰ", "", "Ⅱ", "", "⅓", ""], - f_ue: ["", "12", "", "①", "", "②", "", "Ⅰ", "", "Ⅱ", "", "⅓", ""], - f_se: ["", "12", "", "①", "", "②", "", "Ⅰ", "", "Ⅱ", "", "⅓", ""], - } -} - - -# From the above collections, create the parametrized tests and labels. -regex_params = [ - (given, expected, regex) - for given, values in regex_tests.items() - for regex, expected in values.items() -] -labels = ["{}-{}".format(given, regex_names[regex]) for given, _, regex in regex_params] - - -@pytest.mark.parametrize("x, expected, regex", regex_params, ids=labels) -def test_regex_splits_correctly(x, expected, regex): - # noinspection PyUnresolvedReferences - assert regex.split(x) == expected diff --git a/test_natsort/test_string_component_transform_factory.py b/test_natsort/test_string_component_transform_factory.py deleted file mode 100644 index d52ae01..0000000 --- a/test_natsort/test_string_component_transform_factory.py +++ /dev/null @@ -1,78 +0,0 @@ -# -*- coding: utf-8 -*- -"""These test the utils.py functions.""" -from __future__ import unicode_literals - -from functools import partial - -import pytest -from hypothesis import example, given -from hypothesis.strategies import floats, integers, text -from natsort.compat.fastnumbers import fast_float, fast_int -from natsort.compat.locale import get_strxfrm -from natsort.compat.py23 import py23_range, py23_str, py23_unichr -from natsort.ns_enum import ns, ns_DUMB -from natsort.utils import groupletters, 
string_component_transform_factory - -# There are some unicode values that are known failures with the builtin locale -# library on BSD systems that has nothing to do with natsort (a ValueError is -# raised by strxfrm). Let's filter them out. -try: - bad_uni_chars = frozenset( - py23_unichr(x) for x in py23_range(0X10fefd, 0X10ffff + 1) - ) -except ValueError: - # Narrow unicode build... no worries. - bad_uni_chars = frozenset() - - -def no_bad_uni_chars(x, _bad_chars=bad_uni_chars): - """Ensure text does not contain bad unicode characters""" - return not any(y in _bad_chars for y in x) - - -def no_null(x): - """Ensure text does not contain a null character.""" - return "\0" not in x - - -@pytest.mark.parametrize( - "alg, example_func", - [ - (ns.INT, fast_int), - (ns.DEFAULT, fast_int), - (ns.FLOAT, partial(fast_float, nan=float("-inf"))), - (ns.FLOAT | ns.NANLAST, partial(fast_float, nan=float("+inf"))), - (ns.GROUPLETTERS, partial(fast_int, key=groupletters)), - (ns.LOCALE, partial(fast_int, key=lambda x: get_strxfrm()(x))), - ( - ns.GROUPLETTERS | ns.LOCALE, - partial(fast_int, key=lambda x: get_strxfrm()(groupletters(x))), - ), - ( - ns_DUMB | ns.LOCALE, - partial(fast_int, key=lambda x: get_strxfrm()(groupletters(x))), - ), - ( - ns.GROUPLETTERS | ns.LOCALE | ns.FLOAT | ns.NANLAST, - partial( - fast_float, - key=lambda x: get_strxfrm()(groupletters(x)), - nan=float("+inf"), - ), - ), - ], -) -@example(x=float("nan")) -@given( - x=integers() - | floats() - | text().filter(bool).filter(no_bad_uni_chars).filter(no_null) -) -@pytest.mark.usefixtures("with_locale_en_us") -def test_string_component_transform_factory(x, alg, example_func): - string_component_transform_func = string_component_transform_factory(alg) - try: - assert string_component_transform_func(py23_str(x)) == example_func(py23_str(x)) - except ValueError as e: # handle broken locale lib on BSD. 
- if "is not in range" not in str(e): - raise diff --git a/test_natsort/test_unicode_numbers.py b/test_natsort/test_unicode_numbers.py deleted file mode 100644 index 484fbb2..0000000 --- a/test_natsort/test_unicode_numbers.py +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: utf-8 -*- -"""\ -Test the Unicode numbers module. -""" -from __future__ import unicode_literals - -import unicodedata - -from natsort.compat.py23 import py23_range, py23_unichr -from natsort.unicode_numbers import ( - decimal_chars, - decimals, - digit_chars, - digits, - digits_no_decimals, - numeric, - numeric_chars, - numeric_hex, - numeric_no_decimals, -) - - -def test_numeric_chars_contains_only_valid_unicode_numeric_characters(): - for a in numeric_chars: - assert unicodedata.numeric(a, None) is not None - - -def test_digit_chars_contains_only_valid_unicode_digit_characters(): - for a in digit_chars: - assert unicodedata.digit(a, None) is not None - - -def test_decimal_chars_contains_only_valid_unicode_decimal_characters(): - for a in decimal_chars: - assert unicodedata.decimal(a, None) is not None - - -def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters(): - set_numeric_hex = set(numeric_hex) - set_numeric_chars = set(numeric_chars) - set_digit_chars = set(digit_chars) - set_decimal_chars = set(decimal_chars) - for i in py23_range(0X110000): - try: - a = py23_unichr(i) - except ValueError: - break - if a in set("0123456789"): - continue - if unicodedata.numeric(a, None) is not None: - assert i in set_numeric_hex - assert a in set_numeric_chars - if unicodedata.digit(a, None) is not None: - assert i in set_numeric_hex - assert a in set_digit_chars - if unicodedata.decimal(a, None) is not None: - assert i in set_numeric_hex - assert a in set_decimal_chars - - assert set_decimal_chars.isdisjoint(digits_no_decimals) - assert set_digit_chars.issuperset(digits_no_decimals) - - assert set_decimal_chars.isdisjoint(numeric_no_decimals) - assert 
set_numeric_chars.issuperset(numeric_no_decimals) - - -def test_combined_string_contains_all_characters_in_list(): - assert numeric == "".join(numeric_chars) - assert digits == "".join(digit_chars) - assert decimals == "".join(decimal_chars) diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py deleted file mode 100644 index 663a682..0000000 --- a/test_natsort/test_utils.py +++ /dev/null @@ -1,197 +0,0 @@ -# -*- coding: utf-8 -*- -"""These test the utils.py functions.""" -from __future__ import unicode_literals - -import pathlib -import string -from itertools import chain -from operator import neg as op_neg - -import pytest -from hypothesis import given -from hypothesis.strategies import integers, lists, sampled_from, text -from natsort import utils -from natsort.compat.py23 import py23_cmp, py23_int, py23_lower, py23_str -from natsort.ns_enum import ns - - -def test_do_decoding_decodes_bytes_string_to_unicode(): - assert type(utils.do_decoding(b"bytes", "ascii")) is py23_str - assert utils.do_decoding(b"bytes", "ascii") == "bytes" - assert utils.do_decoding(b"bytes", "ascii") == b"bytes".decode("ascii") - - -def test_args_to_enum_raises_typeerror_for_invalid_argument(): - with pytest.raises(TypeError): - utils.args_to_enum(**{"alf": 0}) - - -@pytest.mark.parametrize( - "kwargs, expected", - [ - ({"number_type": float, "signed": True, "exp": True}, ns.F | ns.S), - ({"number_type": float, "signed": True, "exp": False}, ns.F | ns.N | ns.S), - ({"number_type": float, "signed": False, "exp": True}, ns.F | ns.U), - ({"number_type": float, "signed": False, "exp": True}, ns.F), - ({"number_type": float, "signed": False, "exp": False}, ns.F | ns.U | ns.N), - ({"number_type": float, "as_path": True}, ns.F | ns.P), - ({"number_type": int, "as_path": True}, ns.I | ns.P), - ({"number_type": int, "signed": False}, ns.I | ns.U), - ({"number_type": None, "exp": True}, ns.I | ns.U), - ], -) -def test_args_to_enum(kwargs, expected): - with 
pytest.warns(DeprecationWarning): - assert utils.args_to_enum(**kwargs) == expected - - -@pytest.mark.parametrize( - "alg, expected", - [ - (ns.I, utils.NumericalRegularExpressions.int_nosign()), - (ns.I | ns.N, utils.NumericalRegularExpressions.int_nosign()), - (ns.I | ns.S, utils.NumericalRegularExpressions.int_sign()), - (ns.I | ns.S | ns.N, utils.NumericalRegularExpressions.int_sign()), - (ns.F, utils.NumericalRegularExpressions.float_nosign_exp()), - (ns.F | ns.N, utils.NumericalRegularExpressions.float_nosign_noexp()), - (ns.F | ns.S, utils.NumericalRegularExpressions.float_sign_exp()), - (ns.F | ns.S | ns.N, utils.NumericalRegularExpressions.float_sign_noexp()), - ], -) -def test_regex_chooser_returns_correct_regular_expression_object(alg, expected): - assert utils.regex_chooser(alg).pattern == expected.pattern - - -@pytest.mark.parametrize( - "alg, value_or_alias", - [ - # Defaults - (ns.DEFAULT, 0), - (ns.TYPESAFE, 0), - (ns.INT, 0), - (ns.VERSION, 0), - (ns.DIGIT, 0), - (ns.UNSIGNED, 0), - # Aliases - (ns.TYPESAFE, ns.T), - (ns.INT, ns.I), - (ns.VERSION, ns.V), - (ns.DIGIT, ns.D), - (ns.UNSIGNED, ns.U), - (ns.FLOAT, ns.F), - (ns.SIGNED, ns.S), - (ns.NOEXP, ns.N), - (ns.PATH, ns.P), - (ns.LOCALEALPHA, ns.LA), - (ns.LOCALENUM, ns.LN), - (ns.LOCALE, ns.L), - (ns.IGNORECASE, ns.IC), - (ns.LOWERCASEFIRST, ns.LF), - (ns.GROUPLETTERS, ns.G), - (ns.UNGROUPLETTERS, ns.UG), - (ns.CAPITALFIRST, ns.C), - (ns.UNGROUPLETTERS, ns.CAPITALFIRST), - (ns.NANLAST, ns.NL), - (ns.COMPATIBILITYNORMALIZE, ns.CN), - (ns.NUMAFTER, ns.NA), - # Convenience - (ns.LOCALE, ns.LOCALEALPHA | ns.LOCALENUM), - (ns.REAL, ns.FLOAT | ns.SIGNED), - ], -) -def test_ns_enum_values_and_aliases(alg, value_or_alias): - assert alg == value_or_alias - - -def test_chain_functions_is_a_no_op_if_no_functions_are_given(): - x = 2345 - assert utils.chain_functions([])(x) is x - - -def test_chain_functions_does_one_function_if_one_function_is_given(): - x = "2345" - assert utils.chain_functions([len])(x) 
== 4 - - -def test_chain_functions_combines_functions_in_given_order(): - x = 2345 - assert utils.chain_functions([str, len, op_neg])(x) == -len(str(x)) - - -# Each test has an "example" version for demonstrative purposes, -# and a test that uses the hypothesis module. - - -def test_groupletters_returns_letters_with_lowercase_transform_of_letter_example(): - assert utils.groupletters("HELLO") == "hHeElLlLoO" - assert utils.groupletters("hello") == "hheelllloo" - - -@given(text().filter(bool)) -def test_groupletters_returns_letters_with_lowercase_transform_of_letter(x): - assert utils.groupletters(x) == "".join( - chain.from_iterable([py23_lower(y), y] for y in x) - ) - - -def test_sep_inserter_does_nothing_if_no_numbers_example(): - assert list(utils.sep_inserter(iter(["a", "b", "c"]), "")) == ["a", "b", "c"] - assert list(utils.sep_inserter(iter(["a"]), "")) == ["a"] - - -def test_sep_inserter_does_nothing_if_only_one_number_example(): - assert list(utils.sep_inserter(iter(["a", 5]), "")) == ["a", 5] - - -def test_sep_inserter_inserts_separator_string_between_two_numbers_example(): - assert list(utils.sep_inserter(iter([5, 9]), "")) == ["", 5, "", 9] - - -@given(lists(elements=text().filter(bool) | integers(), min_size=3)) -def test_sep_inserter_inserts_separator_between_two_numbers(x): - # Rather than just replicating the the results in a different - # algorithm, validate that the "shape" of the output is as expected. 
- result = list(utils.sep_inserter(iter(x), "")) - for i, pos in enumerate(result[1:-1], 1): - if pos == "": - assert isinstance(result[i - 1], py23_int) - assert isinstance(result[i + 1], py23_int) - - -def test_path_splitter_splits_path_string_by_separator_example(): - z = "/this/is/a/path" - assert tuple(utils.path_splitter(z)) == tuple(pathlib.Path(z).parts) - z = pathlib.Path("/this/is/a/path") - assert tuple(utils.path_splitter(z)) == tuple(pathlib.Path(z).parts) - - -@given(lists(sampled_from(string.ascii_letters), min_size=2).filter(all)) -def test_path_splitter_splits_path_string_by_separator(x): - z = py23_str(pathlib.Path(*x)) - assert tuple(utils.path_splitter(z)) == tuple(pathlib.Path(z).parts) - - -def test_path_splitter_splits_path_string_by_separator_and_removes_extension_example(): - z = "/this/is/a/path/file.exe" - y = tuple(pathlib.Path(z).parts) - assert tuple(utils.path_splitter(z)) == y[:-1] + ( - pathlib.Path(z).stem, - pathlib.Path(z).suffix, - ) - - -@given(lists(sampled_from(string.ascii_letters), min_size=3).filter(all)) -def test_path_splitter_splits_path_string_by_separator_and_removes_extension(x): - z = py23_str(pathlib.Path(*x[:-2])) + "." + x[-1] - y = tuple(pathlib.Path(z).parts) - assert tuple(utils.path_splitter(z)) == y[:-1] + ( - pathlib.Path(z).stem, - pathlib.Path(z).suffix, - ) - - -@given(integers()) -def test_py23_cmp(x): - assert py23_cmp(x, x) == 0 - assert py23_cmp(x, x + 1) < 0 - assert py23_cmp(x, x - 1) > 0 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..8a7412b --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,39 @@ +""" +Fixtures for pytest. 
+""" + +import locale + +import pytest + + +def load_locale(x): + """Convenience to load a locale, trying ISO8859-1 first.""" + try: + locale.setlocale(locale.LC_ALL, str("{}.ISO8859-1".format(x))) + except locale.Error: + locale.setlocale(locale.LC_ALL, str("{}.UTF-8".format(x))) + + +@pytest.fixture() +def with_locale_en_us(): + """Convenience to load the en_US locale - reset when complete.""" + orig = locale.getlocale() + yield load_locale("en_US") + locale.setlocale(locale.LC_ALL, orig) + + +@pytest.fixture() +def with_locale_de_de(): + """ + Convenience to load the de_DE locale - reset when complete - skip if missing. + """ + orig = locale.getlocale() + try: + load_locale("de_DE") + except locale.Error: + pytest.skip("requires de_DE locale to be installed") + else: + yield + finally: + locale.setlocale(locale.LC_ALL, orig) diff --git a/tests/profile_natsorted.py b/tests/profile_natsorted.py new file mode 100644 index 0000000..ec7037f --- /dev/null +++ b/tests/profile_natsorted.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +"""\ +This file contains functions to profile natsorted with different +inputs and different settings. 
+""" +from __future__ import print_function + +import cProfile +import locale +import sys + +try: + from natsort import ns, natsort_keygen + from natsort.compat.py23 import py23_range +except ImportError: + sys.path.insert(0, ".") + from natsort import ns, natsort_keygen + from natsort.compat.py23 import py23_range + +locale.setlocale(locale.LC_ALL, "en_US.UTF-8") + +# Samples to parse +number = 14695498 +int_string = "43493" +float_string = "-434.93e7" +plain_string = "hello world" +fancy_string = "7abba9342fdab" +a_path = "/p/Folder (1)/file (1).tar.gz" +some_bytes = b"these are bytes" +a_list = ["hello", "goodbye", "74"] + +basic_key = natsort_keygen() +real_key = natsort_keygen(alg=ns.REAL) +path_key = natsort_keygen(alg=ns.PATH) +locale_key = natsort_keygen(alg=ns.LOCALE) + + +def prof_time_to_generate(): + print("*** Generate Plain Key ***") + for _ in py23_range(100000): + natsort_keygen() + + +cProfile.run("prof_time_to_generate()", sort="time") + + +def prof_parsing(a, msg, key=basic_key): + print(msg) + for _ in py23_range(100000): + key(a) + + +cProfile.run( + 'prof_parsing(int_string, "*** Basic Call, Int as String ***")', sort="time" +) +cProfile.run( + 'prof_parsing(float_string, "*** Basic Call, Float as String ***")', sort="time" +) +cProfile.run('prof_parsing(float_string, "*** Real Call ***", real_key)', sort="time") +cProfile.run('prof_parsing(number, "*** Basic Call, Number ***")', sort="time") +cProfile.run( + 'prof_parsing(fancy_string, "*** Basic Call, Mixed String ***")', sort="time" +) +cProfile.run('prof_parsing(some_bytes, "*** Basic Call, Byte String ***")', sort="time") +cProfile.run('prof_parsing(a_path, "*** Path Call ***", path_key)', sort="time") +cProfile.run('prof_parsing(a_list, "*** Basic Call, Recursive ***")', sort="time") +cProfile.run( + 'prof_parsing("434,930,000 dollars", "*** Locale Call ***", locale_key)', + sort="time", +) diff --git a/tests/test_fake_fastnumbers.py b/tests/test_fake_fastnumbers.py new file mode 100644 
index 0000000..1c0da66 --- /dev/null +++ b/tests/test_fake_fastnumbers.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +"""\ +Test the fake fastnumbers module. +""" +from __future__ import unicode_literals + +import unicodedata +from math import isnan + +from hypothesis import given +from hypothesis.strategies import floats, integers, text +from natsort.compat.fake_fastnumbers import fast_float, fast_int +from natsort.compat.py23 import PY_VERSION + +if PY_VERSION >= 3: + long = int + + +def is_float(x): + try: + float(x) + except ValueError: + try: + unicodedata.numeric(x) + except (ValueError, TypeError): + return False + else: + return True + else: + return True + + +def not_a_float(x): + return not is_float(x) + + +def is_int(x): + try: + return x.is_integer() + except AttributeError: + try: + long(x) + except ValueError: + try: + unicodedata.digit(x) + except (ValueError, TypeError): + return False + else: + return True + else: + return True + + +def not_an_int(x): + return not is_int(x) + + +# Each test has an "example" version for demonstrative purposes, +# and a test that uses the hypothesis module. 
+ + +def test_fast_float_returns_nan_alternate_if_nan_option_is_given(): + assert fast_float("nan", nan=7) == 7 + + +def test_fast_float_converts_float_string_to_float_example(): + assert fast_float("45.8") == 45.8 + assert fast_float("-45") == -45.0 + assert fast_float("45.8e-2", key=len) == 45.8e-2 + assert isnan(fast_float("nan")) + assert isnan(fast_float("+nan")) + assert isnan(fast_float("-NaN")) + assert fast_float("۱۲.۱۲") == 12.12 + assert fast_float("-۱۲.۱۲") == -12.12 + + +@given(floats(allow_nan=False)) +def test_fast_float_converts_float_string_to_float(x): + assert fast_float(repr(x)) == x + + +def test_fast_float_leaves_string_as_is_example(): + assert fast_float("invalid") == "invalid" + + +@given(text().filter(not_a_float).filter(bool)) +def test_fast_float_leaves_string_as_is(x): + assert fast_float(x) == x + + +def test_fast_float_with_key_applies_to_string_example(): + assert fast_float("invalid", key=len) == len("invalid") + + +@given(text().filter(not_a_float).filter(bool)) +def test_fast_float_with_key_applies_to_string(x): + assert fast_float(x, key=len) == len(x) + + +def test_fast_int_leaves_float_string_as_is_example(): + assert fast_int("45.8") == "45.8" + assert fast_int("nan") == "nan" + assert fast_int("inf") == "inf" + + +@given(floats().filter(not_an_int)) +def test_fast_int_leaves_float_string_as_is(x): + assert fast_int(repr(x)) == repr(x) + + +def test_fast_int_converts_int_string_to_int_example(): + assert fast_int("-45") == -45 + assert fast_int("+45") == 45 + assert fast_int("۱۲") == 12 + assert fast_int("-۱۲") == -12 + + +@given(integers()) +def test_fast_int_converts_int_string_to_int(x): + assert fast_int(repr(x)) == x + + +def test_fast_int_leaves_string_as_is_example(): + assert fast_int("invalid") == "invalid" + + +@given(text().filter(not_an_int).filter(bool)) +def test_fast_int_leaves_string_as_is(x): + assert fast_int(x) == x + + +def test_fast_int_with_key_applies_to_string_example(): + assert fast_int("invalid", 
key=len) == len("invalid") + + +@given(text().filter(not_an_int).filter(bool)) +def test_fast_int_with_key_applies_to_string(x): + assert fast_int(x, key=len) == len(x) diff --git a/tests/test_final_data_transform_factory.py b/tests/test_final_data_transform_factory.py new file mode 100644 index 0000000..fb2eb25 --- /dev/null +++ b/tests/test_final_data_transform_factory.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +"""These test the utils.py functions.""" +from __future__ import unicode_literals + +import pytest +from hypothesis import example, given +from hypothesis.strategies import floats, integers, text +from natsort.compat.py23 import py23_str +from natsort.ns_enum import NS_DUMB, ns +from natsort.utils import final_data_transform_factory + + +@pytest.mark.parametrize("alg", [ns.DEFAULT, ns.UNGROUPLETTERS, ns.LOCALE]) +@given(x=text(), y=floats(allow_nan=False, allow_infinity=False) | integers()) +@pytest.mark.usefixtures("with_locale_en_us") +def test_final_data_transform_factory_default(x, y, alg): + final_data_transform_func = final_data_transform_factory(alg, "", "::") + value = (x, y) + original_value = "".join(map(py23_str, value)) + result = final_data_transform_func(value, original_value) + assert result == value + + +@pytest.mark.parametrize( + "alg, func", + [ + (ns.UNGROUPLETTERS | ns.LOCALE, lambda x: x), + (ns.LOCALE | ns.UNGROUPLETTERS | NS_DUMB, lambda x: x), + (ns.LOCALE | ns.UNGROUPLETTERS | ns.LOWERCASEFIRST, lambda x: x), + ( + ns.LOCALE | ns.UNGROUPLETTERS | NS_DUMB | ns.LOWERCASEFIRST, + lambda x: x.swapcase(), + ), + ], +) +@given(x=text(), y=floats(allow_nan=False, allow_infinity=False) | integers()) +@example(x="İ", y=0) +@pytest.mark.usefixtures("with_locale_en_us") +def test_final_data_transform_factory_ungroup_and_locale(x, y, alg, func): + final_data_transform_func = final_data_transform_factory(alg, "", "::") + value = (x, y) + original_value = "".join(map(py23_str, value)) + result = final_data_transform_func(value, 
original_value) + if x: + expected = ((func(original_value[:1]),), value) + else: + expected = (("::",), value) + assert result == expected + + +def test_final_data_transform_factory_ungroup_and_locale_empty_tuple(): + final_data_transform_func = final_data_transform_factory(ns.UG | ns.L, "", "::") + assert final_data_transform_func((), "") == ((), ()) diff --git a/tests/test_input_string_transform_factory.py b/tests/test_input_string_transform_factory.py new file mode 100644 index 0000000..f2e9a7d --- /dev/null +++ b/tests/test_input_string_transform_factory.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +"""These test the utils.py functions.""" +from __future__ import unicode_literals + +import pytest +from hypothesis import example, given +from hypothesis.strategies import integers, text +from natsort.compat.py23 import NEWPY +from natsort.ns_enum import NS_DUMB, ns +from natsort.utils import input_string_transform_factory + + +def lower(x): + """Call the appropriate lower method for the Python version.""" + if NEWPY: + return x.casefold() + else: + return x.lower() + + +def thousands_separated_int(n): + """Insert thousands separators in an int.""" + new_int = "" + for i, y in enumerate(reversed(n), 1): + new_int = y + new_int + # For every third digit, insert a thousands separator. 
+ if i % 3 == 0 and i != len(n): + new_int = "," + new_int + return new_int + + +@given(text()) +def test_input_string_transform_factory_is_no_op_for_no_alg_options(x): + input_string_transform_func = input_string_transform_factory(ns.DEFAULT) + assert input_string_transform_func(x) is x + + +@pytest.mark.parametrize( + "alg, example_func", + [ + (ns.IGNORECASE, lower), + (NS_DUMB, lambda x: x.swapcase()), + (ns.LOWERCASEFIRST, lambda x: x.swapcase()), + (NS_DUMB | ns.LOWERCASEFIRST, lambda x: x), # No-op + (ns.IGNORECASE | ns.LOWERCASEFIRST, lambda x: lower(x.swapcase())), + ], +) +@given(x=text()) +def test_input_string_transform_factory(x, alg, example_func): + input_string_transform_func = input_string_transform_factory(alg) + assert input_string_transform_func(x) == example_func(x) + + +@example(12543642642534980) # 12,543,642,642,534,980 => 12543642642534980 +@given(x=integers(min_value=1000)) +@pytest.mark.usefixtures("with_locale_en_us") +def test_input_string_transform_factory_cleans_thousands(x): + int_str = str(x).rstrip("lL") + thousands_int_str = thousands_separated_int(int_str) + assert thousands_int_str.replace(",", "") != thousands_int_str + + input_string_transform_func = input_string_transform_factory(ns.LOCALE) + assert input_string_transform_func(thousands_int_str) == int_str + + # Using LOCALEALPHA does not affect numbers. + input_string_transform_func_no_op = input_string_transform_factory(ns.LOCALEALPHA) + assert input_string_transform_func_no_op(thousands_int_str) == thousands_int_str + + +# These might be too much to test with hypothesis. 
+ + +@pytest.mark.parametrize( + "x, expected", + [ + ("12,543,642642.5345,34980", "12543,642642.5345,34980"), + ("12,59443,642,642.53,4534980", "12,59443,642642.53,4534980"), # No change + ("12543,642,642.5,34534980", "12543,642642.5,34534980"), + ], +) +@pytest.mark.usefixtures("with_locale_en_us") +def test_input_string_transform_factory_handles_us_locale(x, expected): + input_string_transform_func = input_string_transform_factory(ns.LOCALE) + assert input_string_transform_func(x) == expected + + +@pytest.mark.parametrize( + "alg, expected", + [ + (ns.LOCALE, "1543,753"), # Does nothing without FLOAT + (ns.LOCALE | ns.FLOAT, "1543.753"), + (ns.LOCALEALPHA, "1543,753"), # LOCALEALPHA won't do anything, need LOCALENUM + ], +) +@pytest.mark.usefixtures("with_locale_de_de") +def test_input_string_transform_factory_handles_german_locale(alg, expected): + input_string_transform_func = input_string_transform_factory(alg) + assert input_string_transform_func("1543,753") == expected + + +@pytest.mark.usefixtures("with_locale_de_de") +def test_input_string_transform_factory_does_nothing_with_non_num_input(): + input_string_transform_func = input_string_transform_factory(ns.LOCALE | ns.FLOAT) + expected = "154s,t53" + assert input_string_transform_func("154s,t53") == expected diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000..4559b92 --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,223 @@ +# -*- coding: utf-8 -*- +"""\ +Test the natsort command-line tool functions. 
+""" +from __future__ import print_function, unicode_literals + +import re +import sys + +import pytest +from hypothesis import given +from hypothesis.strategies import data, floats, integers, lists +from natsort.__main__ import ( + check_filters, + keep_entry_range, + keep_entry_value, + main, + range_check, + sort_and_print_entries, +) + + +def test_main_passes_default_arguments_with_no_command_line_options(mocker): + p = mocker.patch("natsort.__main__.sort_and_print_entries") + main("num-2", "num-6", "num-1") + args = p.call_args[0][1] + assert not args.paths + assert args.filter is None + assert args.reverse_filter is None + assert args.exclude is None + assert not args.reverse + assert args.number_type == "int" + assert not args.signed + assert args.exp + assert not args.locale + + +def test_main_passes_arguments_with_all_command_line_options(mocker): + arguments = ["--paths", "--reverse", "--locale"] + arguments.extend(["--filter", "4", "10"]) + arguments.extend(["--reverse-filter", "100", "110"]) + arguments.extend(["--number-type", "float"]) + arguments.extend(["--noexp", "--sign"]) + arguments.extend(["--exclude", "34"]) + arguments.extend(["--exclude", "35"]) + arguments.extend(["num-2", "num-6", "num-1"]) + p = mocker.patch("natsort.__main__.sort_and_print_entries") + main(*arguments) + args = p.call_args[0][1] + assert args.paths + assert args.filter == [(4.0, 10.0)] + assert args.reverse_filter == [(100.0, 110.0)] + assert args.exclude == [34, 35] + assert args.reverse + assert args.number_type == "float" + assert args.signed + assert not args.exp + assert args.locale + + +class Args: + """A dummy class to simulate the argparse Namespace object""" + + def __init__(self, filt, reverse_filter, exclude, as_path, reverse): + self.filter = filt + self.reverse_filter = reverse_filter + self.exclude = exclude + self.reverse = reverse + self.number_type = "float" + self.signed = True + self.exp = True + self.paths = as_path + self.locale = 0 + + +mock_print = 
"__builtin__.print" if sys.version[0] == "2" else "builtins.print" + +entries = [ + "tmp/a57/path2", + "tmp/a23/path1", + "tmp/a1/path1", + "tmp/a1 (1)/path1", + "tmp/a130/path1", + "tmp/a64/path1", + "tmp/a64/path2", +] + + +@pytest.mark.parametrize( + "options, order", + [ + # Defaults, all options false + # tmp/a1 (1)/path1 + # tmp/a1/path1 + # tmp/a23/path1 + # tmp/a57/path2 + # tmp/a64/path1 + # tmp/a64/path2 + # tmp/a130/path1 + ([None, None, False, False, False], [3, 2, 1, 0, 5, 6, 4]), + # Path option True + # tmp/a1/path1 + # tmp/a1 (1)/path1 + # tmp/a23/path1 + # tmp/a57/path2 + # tmp/a64/path1 + # tmp/a64/path2 + # tmp/a130/path1 + ([None, None, False, True, False], [2, 3, 1, 0, 5, 6, 4]), + # Filter option keeps only within range + # tmp/a23/path1 + # tmp/a57/path2 + # tmp/a64/path1 + # tmp/a64/path2 + ([[(20, 100)], None, False, False, False], [1, 0, 5, 6]), + # Reverse filter, exclude in range + # tmp/a1/path1 + # tmp/a1 (1)/path1 + # tmp/a130/path1 + ([None, [(20, 100)], False, True, False], [2, 3, 4]), + # Exclude given values with exclude list + # tmp/a1/path1 + # tmp/a1 (1)/path1 + # tmp/a57/path2 + # tmp/a64/path1 + # tmp/a64/path2 + ([None, None, [23, 130], True, False], [2, 3, 0, 5, 6]), + # Reverse order + # tmp/a130/path1 + # tmp/a64/path2 + # tmp/a64/path1 + # tmp/a57/path2 + # tmp/a23/path1 + # tmp/a1 (1)/path1 + # tmp/a1/path1 + ([None, None, False, True, True], reversed([2, 3, 1, 0, 5, 6, 4])), + ], +) +def test_sort_and_print_entries(options, order, mocker): + p = mocker.patch(mock_print) + sort_and_print_entries(entries, Args(*options)) + e = [mocker.call(entries[i]) for i in order] + p.assert_has_calls(e) + + +# Each test has an "example" version for demonstrative purposes, +# and a test that uses the hypothesis module. 
+ + +def test_range_check_returns_range_as_is_but_with_floats_example(): + assert range_check(10, 11) == (10.0, 11.0) + assert range_check(6.4, 30) == (6.4, 30.0) + + +@given(x=floats(allow_nan=False, min_value=-1E8, max_value=1E8) | integers(), d=data()) +def test_range_check_returns_range_as_is_if_first_is_less_than_second(x, d): + # Pull data such that the first is less than the second. + if isinstance(x, float): + y = d.draw(floats(min_value=x + 1.0, max_value=1E9, allow_nan=False)) + else: + y = d.draw(integers(min_value=x + 1)) + assert range_check(x, y) == (x, y) + + +def test_range_check_raises_value_error_if_second_is_less_than_first_example(): + with pytest.raises(ValueError, match="low >= high"): + range_check(7, 2) + + +@given(x=floats(allow_nan=False), d=data()) +def test_range_check_raises_value_error_if_second_is_less_than_first(x, d): + # Pull data such that the first is greater than or equal to the second. + y = d.draw(floats(max_value=x, allow_nan=False)) + with pytest.raises(ValueError, match="low >= high"): + range_check(x, y) + + +def test_check_filters_returns_none_if_filter_evaluates_to_false(): + assert check_filters(()) is None + assert check_filters(False) is None + assert check_filters(None) is None + + +def test_check_filters_returns_input_as_is_if_filter_is_valid_example(): + assert check_filters([(6, 7)]) == [(6, 7)] + assert check_filters([(6, 7), (2, 8)]) == [(6, 7), (2, 8)] + + +@given(x=lists(integers(), min_size=1), d=data()) +def test_check_filters_returns_input_as_is_if_filter_is_valid(x, d): + # ensure y is element-wise greater than x + y = [d.draw(integers(min_value=val + 1)) for val in x] + assert check_filters(list(zip(x, y))) == [(i, j) for i, j in zip(x, y)] + + +def test_check_filters_raises_value_error_if_filter_is_invalid_example(): + with pytest.raises(ValueError, match="Error in --filter: low >= high"): + check_filters([(7, 2)]) + + +@given(x=lists(integers(), min_size=1), d=data()) +def 
test_check_filters_raises_value_error_if_filter_is_invalid(x, d): + # ensure y is element-wise less than or equal to x + y = [d.draw(integers(max_value=val)) for val in x] + with pytest.raises(ValueError, match="Error in --filter: low >= high"): + check_filters(list(zip(x, y))) + + +@pytest.mark.parametrize( + "lows, highs, truth", + # 1. Any portion is between the bounds => True. + # 2. Any portion is between any bounds => True. + # 3. No portion is between the bounds => False. + [([0], [100], True), ([1, 88], [20, 90], True), ([1], [20], False)], +) +def test_keep_entry_range(lows, highs, truth): + assert keep_entry_range("a56b23c89", lows, highs, int, re.compile(r"\d+")) is truth + + +# 1. Values not in entry => True. 2. Values in entry => False. +@pytest.mark.parametrize("values, truth", [([100, 45], True), ([23], False)]) +def test_keep_entry_value(values, truth): + assert keep_entry_value("a56b23c89", values, int, re.compile(r"\d+")) is truth diff --git a/tests/test_natsort_cmp.py b/tests/test_natsort_cmp.py new file mode 100644 index 0000000..41a252f --- /dev/null +++ b/tests/test_natsort_cmp.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +# pylint: disable=unused-variable +"""These test the natcmp() function. + +Note that these tests are only relevant for Python version < 3. 
+""" +from functools import partial + +import pytest +from hypothesis import given +from hypothesis.strategies import floats, integers, lists +from natsort import ns +from natsort.compat.py23 import PY_VERSION, py23_cmp + +if PY_VERSION < 3: + from natsort import natcmp + + +class Comparable(object): + """Stub class for testing natcmp functionality.""" + + def __init__(self, value): + self.value = value + + def __cmp__(self, other): + return natcmp(self.value, other.value) + + +@pytest.mark.skipif(PY_VERSION >= 3.0, reason="cmp() deprecated in Python 3") +class TestNatCmp: + + def test_classes_can_be_compared(self): + one = Comparable("1") + two = Comparable("2") + another_two = Comparable("2") + ten = Comparable("10") + assert ten > two == another_two > one + + def test_keys_are_being_cached(self, mocker): + natcmp.cached_keys = {} + assert len(natcmp.cached_keys) == 0 + natcmp(0, 0) + assert len(natcmp.cached_keys) == 1 + natcmp(0, 0) + assert len(natcmp.cached_keys) == 1 + + with mocker.patch("natsort.compat.locale.dumb_sort", return_value=False): + natcmp(0, 0, alg=ns.L) + assert len(natcmp.cached_keys) == 2 + natcmp(0, 0, alg=ns.L) + assert len(natcmp.cached_keys) == 2 + + with mocker.patch("natsort.compat.locale.dumb_sort", return_value=True): + natcmp(0, 0, alg=ns.L) + assert len(natcmp.cached_keys) == 3 + natcmp(0, 0, alg=ns.L) + assert len(natcmp.cached_keys) == 3 + + def test_illegal_algorithm_raises_error(self): + with pytest.raises(ValueError): + natcmp(0, 0, alg="Just random stuff") + + def test_classes_can_utilize_max_or_min(self): + comparables = [Comparable(i) for i in range(10)] + + assert max(comparables) == comparables[-1] + assert min(comparables) == comparables[0] + + @given(integers(), integers()) + def test_natcmp_works_the_same_for_integers_as_cmp(self, x, y): + assert py23_cmp(x, y) == natcmp(x, y) + + @given(floats(allow_nan=False), floats(allow_nan=False)) + def test_natcmp_works_the_same_for_floats_as_cmp(self, x, y): + assert 
py23_cmp(x, y) == natcmp(x, y) + + @given(lists(elements=integers())) + def test_sort_strings_with_numbers(self, a_list): + strings = [str(var) for var in a_list] + # noinspection PyArgumentList + natcmp_sorted = sorted(strings, cmp=partial(natcmp, alg=ns.SIGNED)) + + assert sorted(a_list) == [int(var) for var in natcmp_sorted] diff --git a/tests/test_natsort_key.py b/tests/test_natsort_key.py new file mode 100644 index 0000000..e0c442e --- /dev/null +++ b/tests/test_natsort_key.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +"""These test the utils.py functions.""" +from __future__ import unicode_literals + +import pytest +from hypothesis import given +from hypothesis.strategies import binary, floats, integers, lists, text +from natsort.compat.py23 import PY_VERSION, py23_str +from natsort.utils import natsort_key + +if PY_VERSION >= 3: + long = int + + +def str_func(x): + if isinstance(x, py23_str): + return x + else: + raise TypeError("Not a str!") + + +def fail(_): + raise AssertionError("This should never be reached!") + + +@given(floats(allow_nan=False) | integers()) +def test_natsort_key_with_numeric_input_takes_number_path(x): + assert natsort_key(x, None, str_func, fail, lambda y: y) is x + + +@pytest.mark.skipif(PY_VERSION < 3, reason="only valid on python3") +@given(binary().filter(bool)) +def test_natsort_key_with_bytes_input_takes_bytes_path(x): + assert natsort_key(x, None, str_func, lambda y: y, fail) is x + + +@given(text()) +def test_natsort_key_with_text_input_takes_string_path(x): + assert natsort_key(x, None, str_func, fail, fail) is x + + +@given(lists(elements=text(), min_size=1, max_size=10)) +def test_natsort_key_with_nested_input_takes_nested_path(x): + assert natsort_key(x, None, str_func, fail, fail) == tuple(x) + + +@given(text()) +def test_natsort_key_with_key_argument_applies_key_before_processing(x): + assert natsort_key(x, len, str_func, fail, lambda y: y) == len(x) diff --git a/tests/test_natsort_keygen.py 
b/tests/test_natsort_keygen.py new file mode 100644 index 0000000..baa4c3e --- /dev/null +++ b/tests/test_natsort_keygen.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- +"""\ +Here are a collection of examples of how this module can be used. +See the README or the natsort homepage for more details. +""" +from __future__ import print_function, unicode_literals + +import pytest +from natsort import natsort_key, natsort_keygen, natsorted, ns +from natsort.compat.locale import get_strxfrm, null_string_locale +from natsort.compat.py23 import PY_VERSION + + +@pytest.fixture +def arbitrary_input(): + return ["6A-5.034e+1", "/Folder (1)/Foo", 56.7] + + +@pytest.fixture +def bytes_input(): + return b"6A-5.034e+1" + + +def test_natsort_keygen_demonstration(): + original_list = ["a50", "a51.", "a50.31", "a50.4", "a5.034e1", "a50.300"] + copy_of_list = original_list[:] + original_list.sort(key=natsort_keygen(alg=ns.F)) + # natsorted uses the output of natsort_keygen under the hood. + assert original_list == natsorted(copy_of_list, alg=ns.F) + + +def test_natsort_key_public(): + assert natsort_key("a-5.034e2") == ("a-", 5, ".", 34, "e", 2) + + +def test_natsort_keygen_with_invalid_alg_input_raises_value_error(): + # Invalid arguments give the correct response + with pytest.raises(ValueError, match="'alg' argument"): + natsort_keygen(None, "1") + + +@pytest.mark.parametrize( + "alg, expected", + [(ns.DEFAULT, ("a-", 5, ".", 34, "e", 1)), (ns.FLOAT | ns.SIGNED, ("a", -50.34))], +) +def test_natsort_keygen_returns_natsort_key_that_parses_input(alg, expected): + ns_key = natsort_keygen(alg=alg) + assert ns_key("a-5.034e1") == expected + + +@pytest.mark.parametrize( + "alg, expected", + [ + ( + ns.DEFAULT, + (("", 6, "A-", 5, ".", 34, "e+", 1), ("/Folder (", 1, ")/Foo"), ("", 56.7)), + ), + ( + ns.IGNORECASE, + (("", 6, "a-", 5, ".", 34, "e+", 1), ("/folder (", 1, ")/foo"), ("", 56.7)), + ), + (ns.REAL, (("", 6.0, "A", -50.34), ("/Folder (", 1.0, ")/Foo"), ("", 56.7))), + ( + 
ns.LOWERCASEFIRST | ns.FLOAT | ns.NOEXP, + ( + ("", 6.0, "a-", 5.034, "E+", 1.0), + ("/fOLDER (", 1.0, ")/fOO"), + ("", 56.7), + ), + ), + ( + ns.PATH | ns.GROUPLETTERS, + ( + (("", 6, "aA--", 5, "..", 34, "ee++", 1),), + (("//",), ("fFoollddeerr ((", 1, "))"), ("fFoooo",)), + (("", 56.7),), + ), + ), + ], +) +def test_natsort_keygen_handles_arbitrary_input(arbitrary_input, alg, expected): + ns_key = natsort_keygen(alg=alg) + assert ns_key(arbitrary_input) == expected + + +@pytest.mark.parametrize( + "alg, expected", + [ + (ns.DEFAULT, (b"6A-5.034e+1",)), + (ns.IGNORECASE, (b"6a-5.034e+1",)), + (ns.REAL, (b"6A-5.034e+1",)), + (ns.LOWERCASEFIRST | ns.FLOAT | ns.NOEXP, (b"6A-5.034e+1",)), + (ns.PATH | ns.GROUPLETTERS, ((b"6A-5.034e+1",),)), + ], +) +@pytest.mark.skipif(PY_VERSION < 3.0, reason="special bytes handling only on Python3") +def test_natsort_keygen_handles_bytes_input(bytes_input, alg, expected): + ns_key = natsort_keygen(alg=alg) + assert ns_key(bytes_input) == expected + + +@pytest.mark.parametrize( + "alg, expected, is_dumb", + [ + ( + ns.LOCALE, + ( + (null_string_locale, 6, "A-", 5, ".", 34, "e+", 1), + ("/Folder (", 1, ")/Foo"), + (null_string_locale, 56.7), + ), + False, + ), + ( + ns.LOCALE, + ( + (null_string_locale, 6, "aa--", 5, "..", 34, "eE++", 1), + ("//ffoOlLdDeErR ((", 1, "))//ffoOoO"), + (null_string_locale, 56.7), + ), + True, + ), + ( + ns.LOCALE | ns.CAPITALFIRST, + ( + (("",), (null_string_locale, 6, "A-", 5, ".", 34, "e+", 1)), + (("/",), ("/Folder (", 1, ")/Foo")), + (("",), (null_string_locale, 56.7)), + ), + False, + ), + ], +) +@pytest.mark.usefixtures("with_locale_en_us") +def test_natsort_keygen_with_locale(mocker, arbitrary_input, alg, expected, is_dumb): + # First, apply the correct strxfrm function to the string values. 
+ strxfrm = get_strxfrm() + expected = [list(sub) for sub in expected] + try: + for i in (2, 4, 6): + expected[0][i] = strxfrm(expected[0][i]) + for i in (0, 2): + expected[1][i] = strxfrm(expected[1][i]) + expected = tuple(tuple(sub) for sub in expected) + except IndexError: # ns.LOCALE | ns.CAPITALFIRST + expected = [[list(subsub) for subsub in sub] for sub in expected] + for i in (2, 4, 6): + expected[0][1][i] = strxfrm(expected[0][1][i]) + for i in (0, 2): + expected[1][1][i] = strxfrm(expected[1][1][i]) + expected = tuple(tuple(tuple(subsub) for subsub in sub) for sub in expected) + + with mocker.patch("natsort.compat.locale.dumb_sort", return_value=is_dumb): + ns_key = natsort_keygen(alg=alg) + assert ns_key(arbitrary_input) == expected + + +@pytest.mark.parametrize( + "alg, is_dumb", + [(ns.LOCALE, False), (ns.LOCALE, True), (ns.LOCALE | ns.CAPITALFIRST, False)], +) +@pytest.mark.skipif(PY_VERSION < 3.0, reason="special bytes handling only on Python3") +@pytest.mark.usefixtures("with_locale_en_us") +def test_natsort_keygen_with_locale_bytes(mocker, bytes_input, alg, is_dumb): + expected = (b"6A-5.034e+1",) + with mocker.patch("natsort.compat.locale.dumb_sort", return_value=is_dumb): + ns_key = natsort_keygen(alg=alg) + assert ns_key(bytes_input) == expected diff --git a/tests/test_natsorted.py b/tests/test_natsorted.py new file mode 100644 index 0000000..f21cec8 --- /dev/null +++ b/tests/test_natsorted.py @@ -0,0 +1,298 @@ +# -*- coding: utf-8 -*- +"""\ +Here are a collection of examples of how this module can be used. +See the README or the natsort homepage for more details. 
+""" +from __future__ import print_function, unicode_literals + +from operator import itemgetter + +import pytest +from natsort import as_utf8, natsorted, ns +from natsort.compat.py23 import PY_VERSION +from pytest import raises + + +@pytest.fixture +def float_list(): + return ["a50", "a51.", "a50.31", "a-50", "a50.4", "a5.034e1", "a50.300"] + + +@pytest.fixture +def fruit_list(): + return ["Apple", "corn", "Corn", "Banana", "apple", "banana"] + + +@pytest.fixture +def mixed_list(): + return ["Ä", "0", "ä", 3, "b", 1.5, "2", "Z"] + + +def test_natsorted_numbers_in_ascending_order(): + given = ["a2", "a5", "a9", "a1", "a4", "a10", "a6"] + expected = ["a1", "a2", "a4", "a5", "a6", "a9", "a10"] + assert natsorted(given) == expected + + +def test_natsorted_can_sort_as_signed_floats_with_exponents(float_list): + expected = ["a-50", "a50", "a50.300", "a50.31", "a5.034e1", "a50.4", "a51."] + assert natsorted(float_list, alg=ns.REAL) == expected + + +@pytest.mark.parametrize( + # UNSIGNED is default + "alg", + [ns.NOEXP | ns.FLOAT | ns.UNSIGNED, ns.NOEXP | ns.FLOAT], +) +def test_natsorted_can_sort_as_unsigned_and_ignore_exponents(float_list, alg): + expected = ["a5.034e1", "a50", "a50.300", "a50.31", "a50.4", "a51.", "a-50"] + assert natsorted(float_list, alg=alg) == expected + + +# DEFAULT and INT are all equivalent. 
+@pytest.mark.parametrize("alg", [ns.DEFAULT, ns.INT]) +def test_natsorted_can_sort_as_unsigned_ints_which_is_default(float_list, alg): + expected = ["a5.034e1", "a50", "a50.4", "a50.31", "a50.300", "a51.", "a-50"] + assert natsorted(float_list, alg=alg) == expected + + +def test_natsorted_can_sort_as_signed_ints(float_list): + expected = ["a-50", "a5.034e1", "a50", "a50.4", "a50.31", "a50.300", "a51."] + assert natsorted(float_list, alg=ns.SIGNED) == expected + + +@pytest.mark.parametrize( + "alg, expected", + [(ns.UNSIGNED, ["a7", "a+2", "a-5"]), (ns.SIGNED, ["a-5", "a+2", "a7"])], +) +def test_natsorted_can_sort_with_or_without_accounting_for_sign(alg, expected): + given = ["a-5", "a7", "a+2"] + assert natsorted(given, alg=alg) == expected + + +def test_natsorted_can_sort_as_version_numbers(): + given = ["1.9.9a", "1.11", "1.9.9b", "1.11.4", "1.10.1"] + expected = ["1.9.9a", "1.9.9b", "1.10.1", "1.11", "1.11.4"] + assert natsorted(given) == expected + + +@pytest.mark.parametrize( + "alg, expected", + [ + (ns.DEFAULT, ["0", 1.5, "2", 3, "Ä", "Z", "ä", "b"]), + (ns.NUMAFTER, ["Ä", "Z", "ä", "b", "0", 1.5, "2", 3]), + ], +) +def test_natsorted_handles_mixed_types(mixed_list, alg, expected): + assert natsorted(mixed_list, alg=alg) == expected + + +@pytest.mark.parametrize( + "alg, expected, slc", + [ + (ns.DEFAULT, [float("nan"), 5, "25", 1E40], slice(1, None)), + (ns.NANLAST, [5, "25", 1E40, float("nan")], slice(None, 3)), + ], +) +def test_natsorted_handles_nan(alg, expected, slc): + given = ["25", 5, float("nan"), 1E40] + # The slice is because NaN != NaN + # noinspection PyUnresolvedReferences + assert natsorted(given, alg=alg)[slc] == expected[slc] + + +@pytest.mark.skipif(PY_VERSION < 3.0, reason="error is only raised on Python 3") +def test_natsorted_with_mixed_bytes_and_str_input_raises_type_error(): + with raises(TypeError, match="bytes"): + natsorted(["ä", b"b"]) + + # ...unless you use as_utf (or some other decoder). 
+ assert natsorted(["ä", b"b"], key=as_utf8) == ["ä", b"b"] + + +def test_natsorted_raises_type_error_for_non_iterable_input(): + with raises(TypeError, match="'int' object is not iterable"): + natsorted(100) + + +def test_natsorted_recurses_into_nested_lists(): + given = [["a1", "a5"], ["a1", "a40"], ["a10", "a1"], ["a2", "a5"]] + expected = [["a1", "a5"], ["a1", "a40"], ["a2", "a5"], ["a10", "a1"]] + assert natsorted(given) == expected + + +def test_natsorted_applies_key_to_each_list_element_before_sorting_list(): + given = [("a", "num3"), ("b", "num5"), ("c", "num2")] + expected = [("c", "num2"), ("a", "num3"), ("b", "num5")] + assert natsorted(given, key=itemgetter(1)) == expected + + +def test_natsorted_returns_list_in_reversed_order_with_reverse_option(float_list): + expected = natsorted(float_list)[::-1] + assert natsorted(float_list, reverse=True) == expected + + +def test_natsorted_handles_filesystem_paths(): + given = [ + "/p/Folder (10)/file.tar.gz", + "/p/Folder/file.tar.gz", + "/p/Folder (1)/file (1).tar.gz", + "/p/Folder (1)/file.tar.gz", + ] + expected_correct = [ + "/p/Folder/file.tar.gz", + "/p/Folder (1)/file.tar.gz", + "/p/Folder (1)/file (1).tar.gz", + "/p/Folder (10)/file.tar.gz", + ] + expected_incorrect = [ + "/p/Folder (1)/file (1).tar.gz", + "/p/Folder (1)/file.tar.gz", + "/p/Folder (10)/file.tar.gz", + "/p/Folder/file.tar.gz", + ] + # Is incorrect by default. + assert natsorted(given) == expected_incorrect + # Need ns.PATH to make it correct. 
+ assert natsorted(given, alg=ns.PATH) == expected_correct + + +def test_natsorted_handles_numbers_and_filesystem_paths_simultaneously(): + # You can sort paths and numbers, not that you'd want to + given = ["/Folder (9)/file.exe", 43] + expected = [43, "/Folder (9)/file.exe"] + assert natsorted(given, alg=ns.PATH) == expected + + +@pytest.mark.parametrize( + "alg, expected", + [ + (ns.DEFAULT, ["Apple", "Banana", "Corn", "apple", "banana", "corn"]), + (ns.IGNORECASE, ["Apple", "apple", "Banana", "banana", "corn", "Corn"]), + (ns.LOWERCASEFIRST, ["apple", "banana", "corn", "Apple", "Banana", "Corn"]), + (ns.GROUPLETTERS, ["Apple", "apple", "Banana", "banana", "Corn", "corn"]), + (ns.G | ns.LF, ["apple", "Apple", "banana", "Banana", "corn", "Corn"]), + ], +) +def test_natsorted_supports_case_handling(alg, expected, fruit_list): + assert natsorted(fruit_list, alg=alg) == expected + + +@pytest.mark.parametrize( + "alg, expected", + [ + (ns.DEFAULT, [("A5", "a6"), ("a3", "a1")]), + (ns.LOWERCASEFIRST, [("a3", "a1"), ("A5", "a6")]), + (ns.IGNORECASE, [("a3", "a1"), ("A5", "a6")]), + ], +) +def test_natsorted_supports_nested_case_handling(alg, expected): + given = [("A5", "a6"), ("a3", "a1")] + assert natsorted(given, alg=alg) == expected + + +@pytest.mark.parametrize( + "alg, expected", + [ + (ns.DEFAULT, ["apple", "Apple", "banana", "Banana", "corn", "Corn"]), + (ns.CAPITALFIRST, ["Apple", "Banana", "Corn", "apple", "banana", "corn"]), + (ns.LOWERCASEFIRST, ["Apple", "apple", "Banana", "banana", "Corn", "corn"]), + (ns.C | ns.LF, ["apple", "banana", "corn", "Apple", "Banana", "Corn"]), + ], +) +@pytest.mark.usefixtures("with_locale_en_us") +def test_natsorted_can_sort_using_locale(fruit_list, alg, expected): + assert natsorted(fruit_list, alg=ns.LOCALE | alg) == expected + + +@pytest.mark.usefixtures("with_locale_en_us") +def test_natsorted_can_sort_locale_specific_numbers_en(): + given = ["c", "a5,467.86", "ä", "b", "a5367.86", "a5,6", "a5,50"] + expected = ["a5,6", 
"a5,50", "a5367.86", "a5,467.86", "ä", "b", "c"] + assert natsorted(given, alg=ns.LOCALE | ns.F) == expected + + +@pytest.mark.usefixtures("with_locale_de_de") +def test_natsorted_can_sort_locale_specific_numbers_de(): + given = ["c", "a5.467,86", "ä", "b", "a5367.86", "a5,6", "a5,50"] + expected = ["a5,50", "a5,6", "a5367.86", "a5.467,86", "ä", "b", "c"] + assert natsorted(given, alg=ns.LOCALE | ns.F) == expected + + +@pytest.mark.parametrize( + "alg, expected", + [ + (ns.DEFAULT, ["0", 1.5, "2", 3, "ä", "Ä", "b", "Z"]), + (ns.NUMAFTER, ["ä", "Ä", "b", "Z", "0", 1.5, "2", 3]), + (ns.UNGROUPLETTERS, ["0", 1.5, "2", 3, "Ä", "Z", "ä", "b"]), + (ns.UG | ns.NA, ["Ä", "Z", "ä", "b", "0", 1.5, "2", 3]), + # Adding PATH changes nothing. + (ns.PATH, ["0", 1.5, "2", 3, "ä", "Ä", "b", "Z"]), + (ns.PATH | ns.NUMAFTER, ["ä", "Ä", "b", "Z", "0", 1.5, "2", 3]), + (ns.PATH | ns.UNGROUPLETTERS, ["0", 1.5, "2", 3, "Ä", "Z", "ä", "b"]), + (ns.PATH | ns.UG | ns.NA, ["Ä", "Z", "ä", "b", "0", 1.5, "2", 3]), + ], +) +@pytest.mark.usefixtures("with_locale_en_us") +def test_natsorted_handles_mixed_types_with_locale(mixed_list, alg, expected): + assert natsorted(mixed_list, alg=ns.LOCALE | alg) == expected + + +@pytest.mark.parametrize( + "alg, expected", + [ + (ns.DEFAULT, ["73", "5039", "Banana", "apple", "corn", "~~~~~~"]), + (ns.NUMAFTER, ["Banana", "apple", "corn", "~~~~~~", "73", "5039"]), + ], +) +def test_natsorted_sorts_an_odd_collection_of_strings(alg, expected): + given = ["apple", "Banana", "73", "5039", "corn", "~~~~~~"] + assert natsorted(given, alg=alg) == expected + + +def test_natsorted_sorts_mixed_ascii_and_non_ascii_numbers(): + given = [ + "1st street", + "10th street", + "2nd street", + "2 street", + "1 street", + "1street", + "11 street", + "street 2", + "street 1", + "Street 11", + "۲ street", + "۱ street", + "۱street", + "۱۲street", + "۱۱ street", + "street ۲", + "street ۱", + "street ۱", + "street ۱۲", + "street ۱۱", + ] + expected = [ + "1 street", + "۱ street", + 
"1st street", + "1street", + "۱street", + "2 street", + "۲ street", + "2nd street", + "10th street", + "11 street", + "۱۱ street", + "۱۲street", + "street 1", + "street ۱", + "street ۱", + "street 2", + "street ۲", + "Street 11", + "street ۱۱", + "street ۱۲", + ] + assert natsorted(given, alg=ns.IGNORECASE) == expected diff --git a/tests/test_natsorted_convenience.py b/tests/test_natsorted_convenience.py new file mode 100644 index 0000000..876a4e7 --- /dev/null +++ b/tests/test_natsorted_convenience.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +"""\ +Here are a collection of examples of how this module can be used. +See the README or the natsort homepage for more details. +""" +from __future__ import print_function, unicode_literals + +from operator import itemgetter + +import pytest +from natsort import ( + as_ascii, + as_utf8, + decoder, + humansorted, + index_humansorted, + index_natsorted, + index_realsorted, + natsorted, + ns, + order_by_index, + realsorted, +) +from natsort.compat.py23 import PY_VERSION + + +@pytest.fixture +def version_list(): + return ["1.9.9a", "1.11", "1.9.9b", "1.11.4", "1.10.1"] + + +@pytest.fixture +def float_list(): + return ["a50", "a51.", "a50.31", "a-50", "a50.4", "a5.034e1", "a50.300"] + + +@pytest.fixture +def fruit_list(): + return ["Apple", "corn", "Corn", "Banana", "apple", "banana"] + + +def test_decoder_returns_function_that_can_decode_bytes_but_return_non_bytes_as_is(): + func = decoder("latin1") + str_obj = "bytes" + int_obj = 14 + assert func(b"bytes") == str_obj + assert func(int_obj) is int_obj # returns as-is, same object ID + if PY_VERSION >= 3: + assert ( + func(str_obj) is str_obj + ) # same object returned on Python3 b/c only bytes has decode + else: + assert func(str_obj) is not str_obj + assert ( + func(str_obj) == str_obj + ) # not same object on Python2 because str can decode + + +def test_as_ascii_converts_bytes_to_ascii(): + assert decoder("ascii")(b"bytes") == as_ascii(b"bytes") + + +def 
test_as_utf8_converts_bytes_to_utf8(): + assert decoder("utf8")(b"bytes") == as_utf8(b"bytes") + + +def test_realsorted_is_identical_to_natsorted_with_real_alg(float_list): + assert realsorted(float_list) == natsorted(float_list, alg=ns.REAL) + + +@pytest.mark.usefixtures("with_locale_en_us") +def test_humansorted_is_identical_to_natsorted_with_locale_alg(fruit_list): + assert humansorted(fruit_list) == natsorted(fruit_list, alg=ns.LOCALE) + + +def test_index_natsorted_returns_integer_list_of_sort_order_for_input_list(): + given = ["num3", "num5", "num2"] + other = ["foo", "bar", "baz"] + index = index_natsorted(given) + assert index == [2, 0, 1] + assert [given[i] for i in index] == ["num2", "num3", "num5"] + assert [other[i] for i in index] == ["baz", "foo", "bar"] + + +def test_index_natsorted_reverse(): + given = ["num3", "num5", "num2"] + assert index_natsorted(given, reverse=True) == index_natsorted(given)[::-1] + + +def test_index_natsorted_applies_key_function_before_sorting(): + given = [("a", "num3"), ("b", "num5"), ("c", "num2")] + expected = [2, 0, 1] + assert index_natsorted(given, key=itemgetter(1)) == expected + + +def test_index_realsorted_is_identical_to_index_natsorted_with_real_alg(float_list): + assert index_realsorted(float_list) == index_natsorted(float_list, alg=ns.REAL) + + +@pytest.mark.usefixtures("with_locale_en_us") +def test_index_humansorted_is_identical_to_index_natsorted_with_locale_alg(fruit_list): + assert index_humansorted(fruit_list) == index_natsorted(fruit_list, alg=ns.LOCALE) + + +def test_order_by_index_sorts_list_according_to_order_of_integer_list(): + given = ["num3", "num5", "num2"] + index = [2, 0, 1] + expected = [given[i] for i in index] + assert expected == ["num2", "num3", "num5"] + assert order_by_index(given, index) == expected + + +def test_order_by_index_returns_generator_with_iter_true(): + given = ["num3", "num5", "num2"] + index = [2, 0, 1] + assert order_by_index(given, index, True) != [given[i] for i in 
index] + assert list(order_by_index(given, index, True)) == [given[i] for i in index] diff --git a/tests/test_ns_enum.py b/tests/test_ns_enum.py new file mode 100644 index 0000000..1d3803b --- /dev/null +++ b/tests/test_ns_enum.py @@ -0,0 +1,44 @@ +from natsort import ns + + +def test_ns_enum(): + enum_name_values = [ + ("FLOAT", 0x0001), + ("SIGNED", 0x0002), + ("NOEXP", 0x0004), + ("PATH", 0x0008), + ("LOCALEALPHA", 0x0010), + ("LOCALENUM", 0x0020), + ("IGNORECASE", 0x0040), + ("LOWERCASEFIRST", 0x0080), + ("GROUPLETTERS", 0x0100), + ("UNGROUPLETTERS", 0x0200), + ("NANLAST", 0x0400), + ("COMPATIBILITYNORMALIZE", 0x0800), + ("NUMAFTER", 0x1000), + ("DEFAULT", 0x0000), + ("INT", 0x0000), + ("UNSIGNED", 0x0000), + ("REAL", 0x0003), + ("LOCALE", 0x0030), + ("I", 0x0000), + ("U", 0x0000), + ("F", 0x0001), + ("S", 0x0002), + ("R", 0x0003), + ("N", 0x0004), + ("P", 0x0008), + ("LA", 0x0010), + ("LN", 0x0020), + ("L", 0x0030), + ("IC", 0x0040), + ("LF", 0x0080), + ("G", 0x0100), + ("UG", 0x0200), + ("C", 0x0200), + ("CAPITALFIRST", 0x0200), + ("NL", 0x0400), + ("CN", 0x0800), + ("NA", 0x1000), + ] + assert list(ns._asdict().items()) == enum_name_values diff --git a/tests/test_parse_bytes_function.py b/tests/test_parse_bytes_function.py new file mode 100644 index 0000000..49f54ae --- /dev/null +++ b/tests/test_parse_bytes_function.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +"""These test the utils.py functions.""" +from __future__ import unicode_literals + +import pytest +from hypothesis import given +from hypothesis.strategies import binary +from natsort.ns_enum import ns +from natsort.utils import parse_bytes_factory + + +@pytest.mark.parametrize( + "alg, example_func", + [ + (ns.DEFAULT, lambda x: (x,)), + (ns.IGNORECASE, lambda x: (x.lower(),)), + # With PATH, it becomes a nested tuple.
+ (ns.PATH, lambda x: ((x,),)), + (ns.PATH | ns.IGNORECASE, lambda x: ((x.lower(),),)), + ], +) +@given(x=binary()) +def test_parse_bytest_factory_makes_function_that_returns_tuple(x, alg, example_func): + parse_bytes_func = parse_bytes_factory(alg) + assert parse_bytes_func(x) == example_func(x) diff --git a/tests/test_parse_number_function.py b/tests/test_parse_number_function.py new file mode 100644 index 0000000..7f22ef4 --- /dev/null +++ b/tests/test_parse_number_function.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +"""These test the utils.py functions.""" +from __future__ import unicode_literals + +import pytest +from hypothesis import given +from hypothesis.strategies import floats, integers +from natsort.ns_enum import ns +from natsort.utils import parse_number_factory + + +@pytest.mark.usefixtures("with_locale_en_us") +@pytest.mark.parametrize( + "alg, example_func", + [ + (ns.DEFAULT, lambda x: ("", x)), + (ns.PATH, lambda x: (("", x),)), + (ns.UNGROUPLETTERS | ns.LOCALE, lambda x: (("xx",), ("", x))), + (ns.PATH | ns.UNGROUPLETTERS | ns.LOCALE, lambda x: ((("xx",), ("", x)),)), + ], +) +@given(x=floats(allow_nan=False) | integers()) +def test_parse_number_factory_makes_function_that_returns_tuple(x, alg, example_func): + parse_number_func = parse_number_factory(alg, "", "xx") + assert parse_number_func(x) == example_func(x) + + +@pytest.mark.parametrize( + "alg, x, result", + [ + (ns.DEFAULT, 57, ("", 57)), + (ns.DEFAULT, float("nan"), ("", float("-inf"))), # NaN transformed to -infinity + (ns.NANLAST, float("nan"), ("", float("+inf"))), # NANLAST makes it +infinity + ], +) +def test_parse_number_factory_treats_nan_special(alg, x, result): + parse_number_func = parse_number_factory(alg, "", "xx") + assert parse_number_func(x) == result diff --git a/tests/test_parse_string_function.py b/tests/test_parse_string_function.py new file mode 100644 index 0000000..2c7729b --- /dev/null +++ b/tests/test_parse_string_function.py @@ -0,0 +1,93 @@ +# -*- coding: 
utf-8 -*- +"""These test the utils.py functions.""" +from __future__ import unicode_literals + +import unicodedata + +import pytest +from hypothesis import given +from hypothesis.strategies import floats, integers, lists, text +from natsort.compat.fastnumbers import fast_float +from natsort.compat.py23 import py23_str +from natsort.ns_enum import NS_DUMB, ns +from natsort.utils import NumericalRegularExpressions as NumRegex +from natsort.utils import parse_string_factory + + +class CustomTuple(tuple): + """Used to ensure what is given during testing is what is returned.""" + + original = None + + +def input_transform(x): + """Make uppercase.""" + try: + return x.upper() + except AttributeError: + return x + + +def final_transform(x, original): + """Make the input a CustomTuple.""" + t = CustomTuple(x) + t.original = original + return t + + +@pytest.fixture +def parse_string_func(request): + """A parse_string_factory result with sample arguments.""" + sep = "" + return parse_string_factory( + request.param, # algorithm + sep, + NumRegex.int_nosign().split, + input_transform, + fast_float, + final_transform, + ) + + +@pytest.mark.parametrize("parse_string_func", [ns.DEFAULT], indirect=True) +@given(x=floats() | integers()) +def test_parse_string_factory_raises_type_error_if_given_number(x, parse_string_func): + with pytest.raises(TypeError): + assert parse_string_func(x) + + +# noinspection PyCallingNonCallable +@pytest.mark.parametrize( + "parse_string_func, orig_func", + [ + (ns.DEFAULT, lambda x: x.upper()), + (ns.LOCALE, lambda x: x.upper()), + (ns.LOCALE | NS_DUMB, lambda x: x), # This changes the "original" handling.
+ ], + indirect=["parse_string_func"], +) +@given( + x=lists( + elements=floats(allow_nan=False) | text() | integers(), min_size=1, max_size=10 + ) +) +@pytest.mark.usefixtures("with_locale_en_us") +def test_parse_string_factory_invariance(x, parse_string_func, orig_func): + # parse_string_factory is the high-level combination of several dedicated + # functions involved in splitting and manipulating a string. The details of + # what those functions do are not relevant to testing parse_string_factory. + # What is relevant is that the form of the output matches the invariant + # that even elements are strings and odd are numerical. That each component + # function is doing what it should is tested elsewhere. + value = "".join(map(py23_str, x)) # Convert the input to a single string. + result = parse_string_func(value) + result_types = list(map(type, result)) + expected_types = [py23_str if i % 2 == 0 else float for i in range(len(result))] + assert result_types == expected_types + + # The result is in our CustomTuple. + assert isinstance(result, CustomTuple) + + # Original should have gone through the "input_transform" + # which is uppercase in these tests. + assert result.original == orig_func(unicodedata.normalize("NFD", value)) diff --git a/tests/test_regex.py b/tests/test_regex.py new file mode 100644 index 0000000..d3fe617 --- /dev/null +++ b/tests/test_regex.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +"""These test the splitting regular expressions.""" +from __future__ import unicode_literals + +import pytest +from natsort.utils import NumericalRegularExpressions as NumRegex + + +regex_names = { + NumRegex.int_nosign(): "int_nosign", + NumRegex.int_sign(): "int_sign", + NumRegex.float_nosign_noexp(): "float_nosign_noexp", + NumRegex.float_sign_noexp(): "float_sign_noexp", + NumRegex.float_nosign_exp(): "float_nosign_exp", + NumRegex.float_sign_exp(): "float_sign_exp", +} + +# Regex Aliases (so lines stay a reasonable length).
+i_u = NumRegex.int_nosign() +i_s = NumRegex.int_sign() +f_u = NumRegex.float_nosign_noexp() +f_s = NumRegex.float_sign_noexp() +f_ue = NumRegex.float_nosign_exp() +f_se = NumRegex.float_sign_exp() + +# Assemble a test suite of regular strings and their regular expression +# splitting result. Organize by the input string. +regex_tests = { + "-123.45e+67": { + i_u: ["-", "123", ".", "45", "e+", "67", ""], + i_s: ["", "-123", ".", "45", "e", "+67", ""], + f_u: ["-", "123.45", "e+", "67", ""], + f_s: ["", "-123.45", "e", "+67", ""], + f_ue: ["-", "123.45e+67", ""], + f_se: ["", "-123.45e+67", ""], + }, + "a-123.45e+67b": { + i_u: ["a-", "123", ".", "45", "e+", "67", "b"], + i_s: ["a", "-123", ".", "45", "e", "+67", "b"], + f_u: ["a-", "123.45", "e+", "67", "b"], + f_s: ["a", "-123.45", "e", "+67", "b"], + f_ue: ["a-", "123.45e+67", "b"], + f_se: ["a", "-123.45e+67", "b"], + }, + "hello": { + i_u: ["hello"], + i_s: ["hello"], + f_u: ["hello"], + f_s: ["hello"], + f_ue: ["hello"], + f_se: ["hello"], + }, + "abc12.34.56-7def": { + i_u: ["abc", "12", ".", "34", ".", "56", "-", "7", "def"], + i_s: ["abc", "12", ".", "34", ".", "56", "", "-7", "def"], + f_u: ["abc", "12.34", "", ".56", "-", "7", "def"], + f_s: ["abc", "12.34", "", ".56", "", "-7", "def"], + f_ue: ["abc", "12.34", "", ".56", "-", "7", "def"], + f_se: ["abc", "12.34", "", ".56", "", "-7", "def"], + }, + "a1b2c3d4e5e6": { + i_u: ["a", "1", "b", "2", "c", "3", "d", "4", "e", "5", "e", "6", ""], + i_s: ["a", "1", "b", "2", "c", "3", "d", "4", "e", "5", "e", "6", ""], + f_u: ["a", "1", "b", "2", "c", "3", "d", "4", "e", "5", "e", "6", ""], + f_s: ["a", "1", "b", "2", "c", "3", "d", "4", "e", "5", "e", "6", ""], + f_ue: ["a", "1", "b", "2", "c", "3", "d", "4e5", "e", "6", ""], + f_se: ["a", "1", "b", "2", "c", "3", "d", "4e5", "e", "6", ""], + }, + "eleven۱۱eleven11eleven১১": { # All of these are the decimal 11 + i_u: ["eleven", "۱۱", "eleven", "11", "eleven", "১১", ""], + i_s: ["eleven", "۱۱", "eleven", "11", 
"eleven", "১১", ""], + f_u: ["eleven", "۱۱", "eleven", "11", "eleven", "১১", ""], + f_s: ["eleven", "۱۱", "eleven", "11", "eleven", "১১", ""], + f_ue: ["eleven", "۱۱", "eleven", "11", "eleven", "১১", ""], + f_se: ["eleven", "۱۱", "eleven", "11", "eleven", "১১", ""], + }, + "12①②ⅠⅡ⅓": { # Two decimals, Two digits, Two numerals, fraction + i_u: ["", "12", "", "①", "", "②", "ⅠⅡ⅓"], + i_s: ["", "12", "", "①", "", "②", "ⅠⅡ⅓"], + f_u: ["", "12", "", "①", "", "②", "", "Ⅰ", "", "Ⅱ", "", "⅓", ""], + f_s: ["", "12", "", "①", "", "②", "", "Ⅰ", "", "Ⅱ", "", "⅓", ""], + f_ue: ["", "12", "", "①", "", "②", "", "Ⅰ", "", "Ⅱ", "", "⅓", ""], + f_se: ["", "12", "", "①", "", "②", "", "Ⅰ", "", "Ⅱ", "", "⅓", ""], + } +} + + +# From the above collections, create the parametrized tests and labels. +regex_params = [ + (given, expected, regex) + for given, values in regex_tests.items() + for regex, expected in values.items() +] +labels = ["{}-{}".format(given, regex_names[regex]) for given, _, regex in regex_params] + + +@pytest.mark.parametrize("x, expected, regex", regex_params, ids=labels) +def test_regex_splits_correctly(x, expected, regex): + # noinspection PyUnresolvedReferences + assert regex.split(x) == expected diff --git a/tests/test_string_component_transform_factory.py b/tests/test_string_component_transform_factory.py new file mode 100644 index 0000000..4754072 --- /dev/null +++ b/tests/test_string_component_transform_factory.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +"""These test the utils.py functions.""" +from __future__ import unicode_literals + +from functools import partial + +import pytest +from hypothesis import example, given +from hypothesis.strategies import floats, integers, text +from natsort.compat.fastnumbers import fast_float, fast_int +from natsort.compat.locale import get_strxfrm +from natsort.compat.py23 import py23_range, py23_str, py23_unichr +from natsort.ns_enum import NS_DUMB, ns +from natsort.utils import groupletters, 
string_component_transform_factory + +# There are some unicode values that are known failures with the builtin locale +# library on BSD systems that has nothing to do with natsort (a ValueError is +# raised by strxfrm). Let's filter them out. +try: + bad_uni_chars = frozenset( + py23_unichr(x) for x in py23_range(0X10fefd, 0X10ffff + 1) + ) +except ValueError: + # Narrow unicode build... no worries. + bad_uni_chars = frozenset() + + +def no_bad_uni_chars(x, _bad_chars=bad_uni_chars): + """Ensure text does not contain bad unicode characters""" + return not any(y in _bad_chars for y in x) + + +def no_null(x): + """Ensure text does not contain a null character.""" + return "\0" not in x + + +@pytest.mark.parametrize( + "alg, example_func", + [ + (ns.INT, fast_int), + (ns.DEFAULT, fast_int), + (ns.FLOAT, partial(fast_float, nan=float("-inf"))), + (ns.FLOAT | ns.NANLAST, partial(fast_float, nan=float("+inf"))), + (ns.GROUPLETTERS, partial(fast_int, key=groupletters)), + (ns.LOCALE, partial(fast_int, key=lambda x: get_strxfrm()(x))), + ( + ns.GROUPLETTERS | ns.LOCALE, + partial(fast_int, key=lambda x: get_strxfrm()(groupletters(x))), + ), + ( + NS_DUMB | ns.LOCALE, + partial(fast_int, key=lambda x: get_strxfrm()(groupletters(x))), + ), + ( + ns.GROUPLETTERS | ns.LOCALE | ns.FLOAT | ns.NANLAST, + partial( + fast_float, + key=lambda x: get_strxfrm()(groupletters(x)), + nan=float("+inf"), + ), + ), + ], +) +@example(x=float("nan")) +@given( + x=integers() + | floats() + | text().filter(bool).filter(no_bad_uni_chars).filter(no_null) +) +@pytest.mark.usefixtures("with_locale_en_us") +def test_string_component_transform_factory(x, alg, example_func): + string_component_transform_func = string_component_transform_factory(alg) + try: + assert string_component_transform_func(py23_str(x)) == example_func(py23_str(x)) + except ValueError as e: # handle broken locale lib on BSD. 
+ if "is not in range" not in str(e): + raise diff --git a/tests/test_unicode_numbers.py b/tests/test_unicode_numbers.py new file mode 100644 index 0000000..582a8f0 --- /dev/null +++ b/tests/test_unicode_numbers.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +"""\ +Test the Unicode numbers module. +""" +from __future__ import unicode_literals + +import unicodedata + +from natsort.compat.py23 import py23_range, py23_unichr +from natsort.unicode_numbers import ( + decimal_chars, + decimals, + digit_chars, + digits, + digits_no_decimals, + numeric, + numeric_chars, + numeric_hex, + numeric_no_decimals, +) + + +def test_numeric_chars_contains_only_valid_unicode_numeric_characters(): + for a in numeric_chars: + assert unicodedata.numeric(a, None) is not None + + +def test_digit_chars_contains_only_valid_unicode_digit_characters(): + for a in digit_chars: + assert unicodedata.digit(a, None) is not None + + +def test_decimal_chars_contains_only_valid_unicode_decimal_characters(): + for a in decimal_chars: + assert unicodedata.decimal(a, None) is not None + + +def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters(): + set_numeric_hex = set(numeric_hex) + set_numeric_chars = set(numeric_chars) + set_digit_chars = set(digit_chars) + set_decimal_chars = set(decimal_chars) + for i in py23_range(0X110000): + try: + a = py23_unichr(i) + except ValueError: + break + if a in "0123456789": + continue + if unicodedata.numeric(a, None) is not None: + assert i in set_numeric_hex + assert a in set_numeric_chars + if unicodedata.digit(a, None) is not None: + assert i in set_numeric_hex + assert a in set_digit_chars + if unicodedata.decimal(a, None) is not None: + assert i in set_numeric_hex + assert a in set_decimal_chars + + assert set_decimal_chars.isdisjoint(digits_no_decimals) + assert set_digit_chars.issuperset(digits_no_decimals) + + assert set_decimal_chars.isdisjoint(numeric_no_decimals) + assert set_numeric_chars.issuperset(numeric_no_decimals) + + +def 
test_combined_string_contains_all_characters_in_list(): + assert numeric == "".join(numeric_chars) + assert digits == "".join(digit_chars) + assert decimals == "".join(decimal_chars) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..5cd469a --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- +"""These test the utils.py functions.""" +from __future__ import unicode_literals + +import pathlib +import string +from itertools import chain +from operator import neg as op_neg + +import pytest +from hypothesis import given +from hypothesis.strategies import integers, lists, sampled_from, text +from natsort import utils +from natsort.compat.py23 import py23_cmp, py23_int, py23_lower, py23_str +from natsort.ns_enum import ns + + +def test_do_decoding_decodes_bytes_string_to_unicode(): + assert type(utils.do_decoding(b"bytes", "ascii")) is py23_str + assert utils.do_decoding(b"bytes", "ascii") == "bytes" + assert utils.do_decoding(b"bytes", "ascii") == b"bytes".decode("ascii") + + +@pytest.mark.parametrize( + "alg, expected", + [ + (ns.I, utils.NumericalRegularExpressions.int_nosign()), + (ns.I | ns.N, utils.NumericalRegularExpressions.int_nosign()), + (ns.I | ns.S, utils.NumericalRegularExpressions.int_sign()), + (ns.I | ns.S | ns.N, utils.NumericalRegularExpressions.int_sign()), + (ns.F, utils.NumericalRegularExpressions.float_nosign_exp()), + (ns.F | ns.N, utils.NumericalRegularExpressions.float_nosign_noexp()), + (ns.F | ns.S, utils.NumericalRegularExpressions.float_sign_exp()), + (ns.F | ns.S | ns.N, utils.NumericalRegularExpressions.float_sign_noexp()), + ], +) +def test_regex_chooser_returns_correct_regular_expression_object(alg, expected): + assert utils.regex_chooser(alg).pattern == expected.pattern + + +@pytest.mark.parametrize( + "alg, value_or_alias", + [ + # Defaults + (ns.DEFAULT, 0), + (ns.INT, 0), + (ns.UNSIGNED, 0), + # Aliases + (ns.INT, ns.I), + (ns.UNSIGNED, ns.U), + (ns.FLOAT, ns.F), 
+ (ns.SIGNED, ns.S), + (ns.NOEXP, ns.N), + (ns.PATH, ns.P), + (ns.LOCALEALPHA, ns.LA), + (ns.LOCALENUM, ns.LN), + (ns.LOCALE, ns.L), + (ns.IGNORECASE, ns.IC), + (ns.LOWERCASEFIRST, ns.LF), + (ns.GROUPLETTERS, ns.G), + (ns.UNGROUPLETTERS, ns.UG), + (ns.CAPITALFIRST, ns.C), + (ns.UNGROUPLETTERS, ns.CAPITALFIRST), + (ns.NANLAST, ns.NL), + (ns.COMPATIBILITYNORMALIZE, ns.CN), + (ns.NUMAFTER, ns.NA), + # Convenience + (ns.LOCALE, ns.LOCALEALPHA | ns.LOCALENUM), + (ns.REAL, ns.FLOAT | ns.SIGNED), + ], +) +def test_ns_enum_values_and_aliases(alg, value_or_alias): + assert alg == value_or_alias + + +def test_chain_functions_is_a_no_op_if_no_functions_are_given(): + x = 2345 + assert utils.chain_functions([])(x) is x + + +def test_chain_functions_does_one_function_if_one_function_is_given(): + x = "2345" + assert utils.chain_functions([len])(x) == 4 + + +def test_chain_functions_combines_functions_in_given_order(): + x = 2345 + assert utils.chain_functions([str, len, op_neg])(x) == -len(str(x)) + + +# Each test has an "example" version for demonstrative purposes, +# and a test that uses the hypothesis module. 
+ + + def test_groupletters_returns_letters_with_lowercase_transform_of_letter_example(): + assert utils.groupletters("HELLO") == "hHeElLlLoO" + assert utils.groupletters("hello") == "hheelllloo" + + +@given(text().filter(bool)) +def test_groupletters_returns_letters_with_lowercase_transform_of_letter(x): + assert utils.groupletters(x) == "".join( + chain.from_iterable([py23_lower(y), y] for y in x) + ) + + +def test_sep_inserter_does_nothing_if_no_numbers_example(): + assert list(utils.sep_inserter(iter(["a", "b", "c"]), "")) == ["a", "b", "c"] + assert list(utils.sep_inserter(iter(["a"]), "")) == ["a"] + + +def test_sep_inserter_does_nothing_if_only_one_number_example(): + assert list(utils.sep_inserter(iter(["a", 5]), "")) == ["a", 5] + + +def test_sep_inserter_inserts_separator_string_between_two_numbers_example(): + assert list(utils.sep_inserter(iter([5, 9]), "")) == ["", 5, "", 9] + + +@given(lists(elements=text().filter(bool) | integers(), min_size=3)) +def test_sep_inserter_inserts_separator_between_two_numbers(x): + # Rather than just replicating the results in a different + # algorithm, validate that the "shape" of the output is as expected.
+ result = list(utils.sep_inserter(iter(x), "")) + for i, pos in enumerate(result[1:-1], 1): + if pos == "": + assert isinstance(result[i - 1], py23_int) + assert isinstance(result[i + 1], py23_int) + + +def test_path_splitter_splits_path_string_by_separator_example(): + z = "/this/is/a/path" + assert tuple(utils.path_splitter(z)) == tuple(pathlib.Path(z).parts) + z = pathlib.Path("/this/is/a/path") + assert tuple(utils.path_splitter(z)) == tuple(pathlib.Path(z).parts) + + +@given(lists(sampled_from(string.ascii_letters), min_size=2).filter(all)) +def test_path_splitter_splits_path_string_by_separator(x): + z = py23_str(pathlib.Path(*x)) + assert tuple(utils.path_splitter(z)) == tuple(pathlib.Path(z).parts) + + +def test_path_splitter_splits_path_string_by_separator_and_removes_extension_example(): + z = "/this/is/a/path/file.exe" + y = tuple(pathlib.Path(z).parts) + assert tuple(utils.path_splitter(z)) == y[:-1] + ( + pathlib.Path(z).stem, + pathlib.Path(z).suffix, + ) + + +@given(lists(sampled_from(string.ascii_letters), min_size=3).filter(all)) +def test_path_splitter_splits_path_string_by_separator_and_removes_extension(x): + z = py23_str(pathlib.Path(*x[:-2])) + "." + x[-1] + y = tuple(pathlib.Path(z).parts) + assert tuple(utils.path_splitter(z)) == y[:-1] + ( + pathlib.Path(z).stem, + pathlib.Path(z).suffix, + ) + + +@given(integers()) +def test_py23_cmp(x): + assert py23_cmp(x, x) == 0 + assert py23_cmp(x, x + 1) < 0 + assert py23_cmp(x, x - 1) > 0 diff --git a/tox.ini b/tox.ini index 86df909..5bdc590 100644 --- a/tox.ini +++ b/tox.ini @@ -18,15 +18,14 @@ passenv = WITH_EXTRAS deps = - pipenv + -r dev-requirements.txt extras = {env:WITH_EXTRAS:} commands = - pipenv install --dev --skip-lock # Only run How It Works doctest on Python 3.6. - py36: {envpython} -m doctest -o IGNORE_EXCEPTION_DETAIL docs/source/howitworks.rst + py36: {envpython} -m doctest -o IGNORE_EXCEPTION_DETAIL docs/howitworks.rst # Other doctests are run for all pythons. 
- pytest README.rst docs/source/intro.rst docs/source/examples.rst + pytest README.rst docs/intro.rst docs/examples.rst pytest --doctest-modules {envsitepackagesdir}/natsort # Full test suite. Allow the user to pass command-line objects. pytest --tb=short --cov {envsitepackagesdir}/natsort --cov-report term-missing {posargs:} @@ -38,7 +37,14 @@ flake8-import-order flake8-bugbear pep8-naming -commands = flake8 + check-manifest + twine +commands = + {envpython} setup.py sdist bdist_wheel + flake8 + check-manifest --ignore ".github*,*.md,.coveragerc" + twine check dist/* +skip_install = true # Build documentation. [testenv:docs] @@ -47,13 +53,13 @@ sphinx_rtd_theme commands = {envpython} setup.py build_sphinx +skip_install = true # Release the code to PyPI [testenv:release] deps = twine - check-manifest commands = - check-manifest {envpython} setup.py sdist bdist_wheel twine upload dist/* +skip_install = true