New Upstream Snapshot - python-itemloaders
Ready changes
Summary
Merged new upstream version: 1.0.6+git20221129.1.94b8099 (was: 1.0.6).
Diff
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
deleted file mode 100644
index 2336baf..0000000
--- a/.bumpversion.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-[bumpversion]
-current_version = 1.0.6
-commit = True
-tag = True
-
-[bumpversion:file:setup.py]
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
deleted file mode 100644
index 85daa7d..0000000
--- a/.github/workflows/main.yml
+++ /dev/null
@@ -1,54 +0,0 @@
-name: CI
-on:
-- pull_request
-- push
-jobs:
- tests:
- runs-on: ubuntu-latest
- strategy:
- matrix:
- include:
- - python-version: 3
- env:
- TOXENV: docs
- - python-version: 3.6
- env:
- TOXENV: py
- - python-version: 3.7
- env:
- TOXENV: py
- - python-version: 3.8
- env:
- TOXENV: py
- - python-version: 3.9
- env:
- TOXENV: py
- - python-version: pypy-3.7
- env:
- TOXENV: py
- - python-version: 3.9
- env:
- TOXENV: extra-deps
- - python-version: '3.10'
- env:
- TOXENV: py
- steps:
- - uses: actions/checkout@v2
- - name: Install system libraries
- if: contains(matrix.python-version, 'pypy')
- run: |
- sudo apt-get update
- sudo apt-get install libxml2-dev libxslt-dev
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install --upgrade tox codecov
- - name: Run tests
- env: ${{ matrix.env }}
- run: tox
- - name: Publish coverage data
- uses: codecov/codecov-action@v1
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
deleted file mode 100644
index 2ed218a..0000000
--- a/.github/workflows/publish.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-name: Publish on PyPI
-on:
- release:
- types: [created]
-jobs:
- publish:
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v2
- - name: Set up Python
- uses: actions/setup-python@v2
- with:
- python-version: 3
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install --upgrade setuptools wheel twine
- - name: Build
- run: |
- python setup.py sdist bdist_wheel
- - name: Upload
- env:
- TWINE_USERNAME: __token__
- TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
- run: |
- twine upload dist/*
diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index d3a82c2..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,21 +0,0 @@
-/.vagrant
-/scrapy.iml
-*.pyc
-_trial_temp*
-dropin.cache
-docs/build
-*egg-info
-.tox
-venv
-.venv
-build
-dist
-.idea
-htmlcov/
-.coverage
-.pytest_cache/
-.coverage.*
-.cache/
-
-# Windows
-Thumbs.db
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..14a0f20
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,103 @@
+Metadata-Version: 2.1
+Name: itemloaders
+Version: 1.0.6
+Summary: Base library for scrapy's ItemLoader
+Home-page: https://github.com/scrapy/itemloaders
+Author: Zyte
+Author-email: opensource@zyte.com
+License: BSD
+Project-URL: Documentation, https://itemloaders.readthedocs.io/
+Project-URL: Source, https://github.com/scrapy/itemloaders
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Requires-Python: >=3.7
+Description-Content-Type: text/x-rst
+License-File: LICENSE
+
+===========
+itemloaders
+===========
+
+.. image:: https://img.shields.io/pypi/v/itemloaders.svg
+ :target: https://pypi.python.org/pypi/itemloaders
+ :alt: PyPI Version
+
+.. image:: https://img.shields.io/pypi/pyversions/itemloaders.svg
+ :target: https://pypi.python.org/pypi/itemloaders
+ :alt: Supported Python Versions
+
+.. image:: https://github.com/scrapy/itemloaders/workflows/CI/badge.svg?branch=master
+ :target: https://github.com/scrapy/itemloaders/actions?workflow=CI
+ :alt: CI Status
+
+.. image:: https://codecov.io/github/scrapy/itemloaders/coverage.svg?branch=master
+ :target: https://codecov.io/gh/scrapy/itemloaders
+ :alt: Coverage report
+
+.. image:: https://readthedocs.org/projects/itemloaders/badge/?version=latest
+ :target: https://itemloaders.readthedocs.io/en/latest/?badge=latest
+ :alt: Documentation Status
+
+
+``itemloaders`` is a library that helps you collect data from HTML and XML sources.
+
+It comes in handy to extract data from web pages, as it supports
+data extraction using CSS and XPath Selectors.
+
+It's especially useful when you need to standardize the data from many sources.
+For example, it allows you to have all your casting and parsing rules in a
+single place.
+
+Here is an example to get you started::
+
+ from itemloaders import ItemLoader
+ from parsel import Selector
+
+ html_data = '''
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>Some random product page</title>
+ </head>
+ <body>
+ <div class="product_name">Some random product page</div>
+ <p id="price">$ 100.12</p>
+ </body>
+ </html>
+ '''
+ loader = ItemLoader(selector=Selector(html_data))
+ loader.add_xpath('name', '//div[@class="product_name"]/text()')
+ loader.add_xpath('name', '//div[@class="product_title"]/text()')
+ loader.add_css('price', '#price::text')
+ loader.add_value('last_updated', 'today') # you can also use literal values
+ item = loader.load_item()
+ item
+ # {'name': ['Some random product page'], 'price': ['$ 100.12'], 'last_updated': ['today']}
+
+For more information, check out the `documentation <https://itemloaders.readthedocs.io/en/latest/>`_.
+
+Contributing
+============
+
+All contributions are welcome!
+
+* If you want to review some code, check open
+ `Pull Requests here <https://github.com/scrapy/itemloaders/pulls>`_
+
+* If you want to submit a code change
+
+ * File an `issue here <https://github.com/scrapy/itemloaders/issues>`_, if there isn't one yet
+ * Fork this repository
+ * Create a branch to work on your changes
+ * Push your local branch and submit a Pull Request
diff --git a/debian/changelog b/debian/changelog
index fac8dd5..93dbb8f 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+python-itemloaders (1.0.6+git20221129.1.94b8099-1) UNRELEASED; urgency=low
+
+ * New upstream snapshot.
+
+ -- Debian Janitor <janitor@jelmer.uk> Wed, 08 Feb 2023 21:27:17 -0000
+
python-itemloaders (1.0.6-1) unstable; urgency=medium
* New upstream version.
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index ff68bf1..0000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,96 +0,0 @@
-#
-# Makefile for Scrapy documentation [based on Python documentation Makefile]
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-
-# You can set these variables from the command line.
-PYTHON = python
-SPHINXOPTS =
-PAPER =
-SOURCES =
-SHELL = /bin/bash
-
-ALLSPHINXOPTS = -b $(BUILDER) -d build/doctrees \
- -D latex_elements.papersize=$(PAPER) \
- $(SPHINXOPTS) . build/$(BUILDER) $(SOURCES)
-
-.PHONY: help update build html htmlhelp clean
-
-help:
- @echo "Please use \`make <target>' where <target> is one of"
- @echo " html to make standalone HTML files"
- @echo " htmlhelp to make HTML files and a HTML help project"
- @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
- @echo " text to make plain text files"
- @echo " changes to make an overview over all changed/added/deprecated items"
- @echo " linkcheck to check all external links for integrity"
- @echo " watch build HTML docs, open in browser and watch for changes"
-
-build-dirs:
- mkdir -p build/$(BUILDER) build/doctrees
-
-build: build-dirs
- sphinx-build $(ALLSPHINXOPTS)
- @echo
-
-build-ignore-errors: build-dirs
- -sphinx-build $(ALLSPHINXOPTS)
- @echo
-
-
-html: BUILDER = html
-html: build
- @echo "Build finished. The HTML pages are in build/html."
-
-htmlhelp: BUILDER = htmlhelp
-htmlhelp: build
- @echo "Build finished; now you can run HTML Help Workshop with the" \
- "build/htmlhelp/pydoc.hhp project file."
-
-latex: BUILDER = latex
-latex: build
- @echo "Build finished; the LaTeX files are in build/latex."
- @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
- "run these through (pdf)latex."
-
-text: BUILDER = text
-text: build
- @echo "Build finished; the text files are in build/text."
-
-changes: BUILDER = changes
-changes: build
- @echo "The overview file is in build/changes."
-
-linkcheck: BUILDER = linkcheck
-linkcheck: build
- @echo "Link check complete; look for any errors in the above output " \
- "or in build/$(BUILDER)/output.txt"
-
-linkfix: BUILDER = linkcheck
-linkfix: build-ignore-errors
- $(PYTHON) utils/linkfix.py
- @echo "Fixing redirecting links in docs has finished; check all " \
- "replacements before committing them"
-
-doctest: BUILDER = doctest
-doctest: build
- @echo "Testing of doctests in the sources finished, look at the " \
- "results in build/doctest/output.txt"
-
-pydoc-topics: BUILDER = pydoc-topics
-pydoc-topics: build
- @echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \
- "into the Lib/ directory"
-
-coverage: BUILDER = coverage
-coverage: build
-
-htmlview: html
- $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \
- os.path.realpath('build/html/index.html'))"
-
-clean:
- -rm -rf build/*
-
-watch: htmlview
- watchmedo shell-command -p '*.rst' -c 'make html' -R -D
diff --git a/docs/README.rst b/docs/README.rst
deleted file mode 100644
index 453858d..0000000
--- a/docs/README.rst
+++ /dev/null
@@ -1,59 +0,0 @@
-:orphan:
-
-===========================================
-itemloaders documentation quick start guide
-===========================================
-
-This file provides a quick guide on how to compile the itemloaders documentation.
-
-
-Setup the environment
----------------------
-
-To compile the documentation you need the Sphinx Python library. To install it
-and all its dependencies run the following command from this dir
-
-::
-
- pip install -r requirements.txt
-
-
-Compile the documentation
--------------------------
-
-To compile the documentation (to classic HTML output) run the following command
-from this dir::
-
- make html
-
-Documentation will be generated (in HTML format) inside the ``build/html`` dir.
-
-
-View the documentation
-----------------------
-
-To view the documentation run the following command::
-
- make htmlview
-
-This command will fire up your default browser and open the main page of your
-(previously generated) HTML documentation.
-
-
-Start over
-----------
-
-To cleanup all generated documentation files and start from scratch run::
-
- make clean
-
-Keep in mind that this command won't touch any documentation source files.
-
-
-Recreating documentation on the fly
------------------------------------
-
-There is a way to recreate the doc automatically when you make changes, you
-need to install watchdog (``pip install watchdog``) and then use::
-
- make watch
diff --git a/docs/_ext/github.py b/docs/_ext/github.py
deleted file mode 100644
index e1adcfc..0000000
--- a/docs/_ext/github.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from docutils import nodes
-from docutils.parsers.rst.roles import set_classes
-
-
-def setup(app):
- app.add_role('gh', github_role)
-
-
-def github_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
- if text.isdigit():
- display_text = f'#{text}'
- url = f'https://github.com/scrapy/itemloaders/issues/{text}'
- else:
- short_commit = text[:7]
- display_text = short_commit
- url = f'https://github.com/scrapy/itemloaders/commit/{short_commit}'
-
- set_classes(options)
- node = nodes.reference(rawtext, display_text, refuri=url, **options)
- return [node], []
diff --git a/docs/api-reference.rst b/docs/api-reference.rst
deleted file mode 100644
index 2a5722b..0000000
--- a/docs/api-reference.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-.. _api-reference:
-
-API Reference
-==================
-
-.. autoclass:: itemloaders.ItemLoader
- :members:
\ No newline at end of file
diff --git a/docs/built-in-processors.rst b/docs/built-in-processors.rst
deleted file mode 100644
index cba26d7..0000000
--- a/docs/built-in-processors.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-.. _built-in-processors:
-
-Available built-in processors
-=============================
-
-Even though you can use any callable function as input and output processors,
-``itemloaders`` provides some commonly used processors, which are described
-below.
-
-Some of them, like the :class:`~itemloaders.processors.MapCompose` (which is
-typically used as input processor) compose the output of several functions
-executed in order, to produce the final parsed value.
-
-Here is a list of all built-in processors:
-
-.. automodule:: itemloaders.processors
- :members:
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index 9713489..0000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# Scrapy documentation build configuration file, created by
-# sphinx-quickstart on Mon Nov 24 12:02:52 2008.
-#
-# This file is execfile()d with the current directory set to its containing dir.
-#
-# The contents of this file are pickled, so don't put values in the namespace
-# that aren't pickleable (module imports are okay, they're removed automatically).
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-import sys
-from datetime import datetime
-from os import path
-
-# If your extensions are in another directory, add it here. If the directory
-# is relative to the documentation root, use os.path.abspath to make it
-# absolute, like shown here.
-sys.path.append(path.dirname(__file__))
-sys.path.insert(0, path.dirname(path.dirname(__file__)))
-
-# General configuration
-# ---------------------
-
-# Add any Sphinx extension module names here, as strings. They can be extensions
-# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = [
- '_ext.github',
- 'sphinx.ext.autodoc',
- 'sphinx.ext.coverage',
- 'sphinx.ext.intersphinx',
- 'sphinx.ext.viewcode',
-]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix of source filenames.
-source_suffix = '.rst'
-
-# The encoding of source files.
-#source_encoding = 'utf-8'
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = 'itemloaders'
-copyright = '2020–{}, Zyte Group Ltd'.format(datetime.now().year)
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-version = ''
-release = ''
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-language = 'en'
-
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-#today = ''
-# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
-
-# List of documents that shouldn't be included in the build.
-#unused_docs = []
-
-exclude_patterns = ['build']
-
-# List of directories, relative to source directory, that shouldn't be searched
-# for source files.
-exclude_trees = ['.build']
-
-# The reST default role (used for this markup: `text`) to use for all documents.
-#default_role = None
-
-# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
-
-# If true, the current module name will be prepended to all description
-# unit titles (such as .. function::).
-#add_module_names = True
-
-# If true, sectionauthor and moduleauthor directives will be shown in the
-# output. They are ignored by default.
-#show_authors = False
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-
-# Options for HTML output
-# -----------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-html_theme = 'sphinx_rtd_theme'
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further. For a list of options available for each theme, see the
-# documentation.
-#html_theme_options = {}
-
-# Add any paths that contain custom themes here, relative to this directory.
-# Add path to the RTD explicitly to robustify builds (otherwise might
-# fail in a clean Debian build env)
-import sphinx_rtd_theme
-html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-
-
-# The style sheet to use for HTML and HTML Help pages. A file of that name
-# must exist either in Sphinx' static/ path, or in one of the custom paths
-# given in html_static_path.
-# html_style = 'scrapydoc.css'
-
-# The name for this set of Sphinx documents. If None, it defaults to
-# "<project> v<release> documentation".
-#html_title = None
-
-# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
-
-# The name of an image file (relative to this directory) to place at the top
-# of the sidebar.
-#html_logo = None
-
-# The name of an image file (within the static path) to use as favicon of the
-# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-#html_favicon = None
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-# html_static_path = ['_static']
-
-# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
-# using the given strftime format.
-html_last_updated_fmt = '%b %d, %Y'
-
-# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
-
-# Additional templates that should be rendered to pages, maps page names to
-# template names.
-#html_additional_pages = {}
-
-# If false, no module index is generated.
-#html_use_modindex = True
-
-# If false, no index is generated.
-#html_use_index = True
-
-# If true, the index is split into individual pages for each letter.
-#html_split_index = False
-
-# If true, the reST sources are included in the HTML build as _sources/<name>.
-html_copy_source = True
-
-# If true, an OpenSearch description file will be output, and all pages will
-# contain a <link> tag referring to it. The value of this option must be the
-# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
-
-# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = ''
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'itemloadersdoc'
-
-
-# Options for LaTeX output
-# ------------------------
-
-# The paper size ('letter' or 'a4').
-#latex_paper_size = 'letter'
-
-# The font size ('10pt', '11pt' or '12pt').
-#latex_font_size = '10pt'
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title, author, document class [howto/manual]).
-latex_documents = [
- ('index', 'itemloaders.tex', 'itemloaders Documentation', 'Zyte', 'manual'),
-]
-
-# The name of an image file (relative to this directory) to place at the top of
-# the title page.
-#latex_logo = None
-
-# For "manual" documents, if this is true, then toplevel headings are parts,
-# not chapters.
-#latex_use_parts = False
-
-# Additional stuff for the LaTeX preamble.
-#latex_preamble = ''
-
-# Documents to append as an appendix to all manuals.
-#latex_appendices = []
-
-# If false, no module index is generated.
-#latex_use_modindex = True
-
-
-# autodocs
-
-
-def setup(app):
- app.connect('autodoc-skip-member', maybe_skip_member)
-
-
-def maybe_skip_member(app, what, name, obj, skip, options):
- if not skip:
- # autodocs was generating a text "alias of" for the following members
- # https://github.com/sphinx-doc/sphinx/issues/4422
- return name in {'default_item_class', 'default_selector_class'}
- return skip
-
-
-nitpicky = True
-
-intersphinx_mapping = {
- 'parsel': ('https://parsel.readthedocs.io/en/stable/', None),
- 'python': ('https://docs.python.org/3', None),
- 'scrapy': ('https://docs.scrapy.org/en/latest/', None),
- 'w3lib': ('https://w3lib.readthedocs.io/en/latest', None),
-}
diff --git a/docs/declaring-loaders.rst b/docs/declaring-loaders.rst
deleted file mode 100644
index c960a2c..0000000
--- a/docs/declaring-loaders.rst
+++ /dev/null
@@ -1,55 +0,0 @@
-.. currentmodule:: itemloaders
-
-.. _declaring-loaders:
-
-Declaring Item Loaders
-======================
-
-Item Loaders are declared by using a class definition syntax. Here is an example::
-
- from itemloaders import ItemLoader
- from itemloaders.processors import TakeFirst, MapCompose, Join
-
- class ProductLoader(ItemLoader):
-
- default_output_processor = TakeFirst()
-
- name_in = MapCompose(str.title)
- name_out = Join()
-
- # using a built-in processor
- price_in = MapCompose(str.strip)
-
- # using a function
- def price_out(self, values):
- return float(values[0])
-
- loader = ProductLoader()
- loader.add_value('name', 'plasma TV')
- loader.add_value('price', '999.98')
- loader.load_item()
- # {'name': 'Plasma Tv', 'price': 999.98}
-
-As you can see, input processors are declared using the ``_in`` suffix while
-output processors are declared using the ``_out`` suffix. And you can also
-declare a default input/output processors using the
-:attr:`ItemLoader.default_input_processor` and
-:attr:`ItemLoader.default_output_processor` attributes.
-
-The precedence order, for both input and output processors, is as follows:
-
-1. Item Loader field-specific attributes: ``field_in`` and ``field_out`` (most
- precedence)
-
-2. Field metadata (``input_processor`` and ``output_processor`` keys).
-
- Check out `itemadapter field metadata
- <https://github.com/scrapy/itemadapter#metadata-support>`_ for more
- information.
-
- .. versionadded:: 1.0.1
-
-3. Item Loader defaults: :attr:`ItemLoader.default_input_processor` and
- :attr:`ItemLoader.default_output_processor` (least precedence)
-
-See also: :ref:`extending-loaders`.
diff --git a/docs/extending-loaders.rst b/docs/extending-loaders.rst
deleted file mode 100644
index acf78d3..0000000
--- a/docs/extending-loaders.rst
+++ /dev/null
@@ -1,45 +0,0 @@
-.. _extending-loaders:
-
-Reusing and extending Item Loaders
-==================================
-
-Item Loaders are designed to ease the maintenance burden of parsing rules,
-without losing flexibility and, at the same time, providing a convenient
-mechanism for extending and overriding them. For this reason Item Loaders
-support traditional Python class inheritance for dealing with differences
-in data schemas.
-
-Suppose, for example, that you get some particular product names enclosed in
-three dashes (e.g. ``---Plasma TV---``) and you don't want to end up with
-those dashes in the final product names.
-
-Here's how you can remove those dashes by reusing and extending the default
-Product Item Loader (``ProductLoader``)::
-
- from itemloaders.processors import MapCompose
- from myproject.loaders import ProductLoader
-
- def strip_dashes(x):
- return x.strip('-')
-
- class SiteSpecificLoader(ProductLoader):
- name_in = MapCompose(strip_dashes, ProductLoader.name_in)
-
-Another case where extending Item Loaders can be very helpful is when you have
-multiple source formats, for example XML and HTML. In the XML version you may
-want to remove ``CDATA`` occurrences. Here's an example of how to do it::
-
- from itemloaders.processors import MapCompose
- from myproject.loaders import ProductLoader
- from myproject.utils.xml import remove_cdata
-
- class XmlProductLoader(ProductLoader):
- name_in = MapCompose(remove_cdata, ProductLoader.name_in)
-
-And that's how you typically extend input/output processors.
-
-There are many other possible ways to extend, inherit and override your Item
-Loaders, and different Item Loaders hierarchies may fit better for different
-projects. ``itemloaders`` only provides the mechanism; it doesn't impose any specific
-organization of your Loaders collection - that's up to you and your project's
-needs.
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index db7aae6..0000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,104 +0,0 @@
-.. currentmodule:: itemloaders
-
-.. _topics-index:
-
-============
-itemloaders
-============
-
-``itemloaders`` provides a convenient mechanism for populating data records.
-Its design provides a flexible, efficient and easy mechanism
-for extending and overriding different field parsing rules, either by raw data,
-or by source format (HTML, XML, etc) without becoming a nightmare to maintain.
-
-To install ``itemloaders``, run::
-
- pip install itemloaders
-
-.. note:: Under the hood, ``itemloaders`` uses
- `itemadapter <https://github.com/scrapy/itemadapter>`_ as a common interface.
- This means you can use any of the types supported by ``itemadapter`` here.
-
-.. warning:: ``dataclasses`` and ``attrs`` support is still experimental.
- Please, refer to :attr:`~ItemLoader.default_item_class` in the
- :ref:`api-reference` for more information.
-
-
-Getting Started with ``itemloaders``
-====================================
-
-To use an Item Loader, you must first instantiate it. You can either
-instantiate it with a dict-like object (`item`) or without one, in
-which case an `item` is automatically instantiated in the Item Loader ``__init__`` method
-using the `item` class specified in the :attr:`ItemLoader.default_item_class`
-attribute.
-
-Then, you start collecting values into the Item Loader, typically using
-CSS or XPath Selectors. You can add more than one value to
-the same item field; the Item Loader will know how to "join" those values later
-using a proper processing function.
-
-.. note:: Collected data is stored internally as lists,
- allowing several values to be added to the same field.
- If an ``item`` argument is passed when creating a loader,
- each of the item's values will be stored as-is if it's already
- an iterable, or wrapped with a list if it's a single value.
-
-Here is a typical Item Loader usage::
-
- from itemloaders import ItemLoader
- from parsel import Selector
-
- html_data = '''
- <!DOCTYPE html>
- <html>
- <head>
- <title>Some random product page</title>
- </head>
- <body>
- <div class="product_name">Some random product page</div>
- <p id="price">$ 100.12</p>
- </body>
- </html>
- '''
- l = ItemLoader(selector=Selector(html_data))
- l.add_xpath('name', '//div[@class="product_name"]/text()')
- l.add_xpath('name', '//div[@class="product_title"]/text()')
- l.add_css('price', '#price::text')
- l.add_value('last_updated', 'today') # you can also use literal values
- item = l.load_item()
- item
- # {'name': ['Some random product page'], 'price': ['$ 100.12'], 'last_updated': ['today']}
-
-By quickly looking at that code, we can see the ``name`` field is being
-extracted from two different XPath locations in the page:
-
-1. ``//div[@class="product_name"]``
-2. ``//div[@class="product_title"]``
-
-In other words, data is being collected by extracting it from two XPath
-locations, using the :meth:`~ItemLoader.add_xpath` method. This is the
-data that will be assigned to the ``name`` field later.
-
-Afterwards, similar calls are used for ``price`` field using a CSS selector with
-the :meth:`~ItemLoader.add_css` method, and finally the ``last_updated`` field is
-populated directly with a literal value
-(``today``) using a different method: :meth:`~ItemLoader.add_value`.
-
-Finally, when all data is collected, the :meth:`ItemLoader.load_item` method is
-called which actually returns the item populated with the data
-previously extracted and collected with the :meth:`~ItemLoader.add_xpath`,
-:meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls.
-
-Contents
---------
-
-.. toctree::
- declaring-loaders
- processors
- loaders-context
- nested-loaders
- extending-loaders
- built-in-processors
- api-reference
- release-notes
diff --git a/docs/loaders-context.rst b/docs/loaders-context.rst
deleted file mode 100644
index b472fae..0000000
--- a/docs/loaders-context.rst
+++ /dev/null
@@ -1,45 +0,0 @@
-.. currentmodule:: itemloaders
-
-.. _loaders-context:
-
-Item Loader Context
-===================
-
-The Item Loader Context is a mechanism that allows changing the behavior of the input/output processors.
-It's just a ``dict`` of arbitrary key/values which is shared among all processors.
-By default, the context contains the ``selector`` and any other `keyword arguments`
-sent to the Loader's ``__init__``.
-The context can be passed when declaring, instantiating or using an Item Loader.
-
-For example, suppose you have a function ``parse_length`` which receives a text
-value and extracts a length from it::
-
- def parse_length(text, loader_context):
- unit = loader_context.get('unit', 'm')
- # ... length parsing code goes here ...
- return parsed_length
-
-By accepting a ``loader_context`` argument the function is explicitly telling
-the Item Loader that it's able to receive an Item Loader context, so the Item
-Loader passes the currently active context when calling it, and the processor
-function (``parse_length`` in this case) can thus use them.
-
-There are several ways to modify Item Loader context values:
-
-1. By modifying the currently active Item Loader context
- (:attr:`~ItemLoader.context` attribute)::
-
- loader = ItemLoader(product)
- loader.context['unit'] = 'cm'
-
-2. On Item Loader instantiation (the keyword arguments of Item Loader
- ``__init__`` method are stored in the Item Loader context)::
-
- loader = ItemLoader(product, unit='cm')
-
-3. On Item Loader declaration, for those input/output processors that support
- instantiating them with an Item Loader context. :class:`~processors.MapCompose` is one of
- them::
-
- class ProductLoader(ItemLoader):
- length_out = MapCompose(parse_length, unit='cm')
diff --git a/docs/nested-loaders.rst b/docs/nested-loaders.rst
deleted file mode 100644
index 4965d5f..0000000
--- a/docs/nested-loaders.rst
+++ /dev/null
@@ -1,45 +0,0 @@
-.. _nested-loaders:
-
-Nested Loaders
-==============
-
-When parsing related values from a subsection of a document, it can be
-useful to create nested loaders. Imagine you're extracting details from
-a footer of a page that looks something like:
-
-Example::
-
- <footer>
- <a class="social" href="https://facebook.com/whatever">Like Us</a>
- <a class="social" href="https://twitter.com/whatever">Follow Us</a>
- <a class="email" href="mailto:whatever@example.com">Email Us</a>
- </footer>
-
-Without nested loaders, you need to specify the full xpath (or css) for each value
-that you wish to extract.
-
-Example::
-
- loader = ItemLoader()
- # load stuff not in the footer
- loader.add_xpath('social', '//footer/a[@class = "social"]/@href')
- loader.add_xpath('email', '//footer/a[@class = "email"]/@href')
- loader.load_item()
-
-Instead, you can create a nested loader with the footer selector and add values
-relative to the footer. The functionality is the same but you avoid repeating
-the footer selector.
-
-Example::
-
- loader = ItemLoader()
- # load stuff not in the footer
- footer_loader = loader.nested_xpath('//footer')
- footer_loader.add_xpath('social', 'a[@class = "social"]/@href')
- footer_loader.add_xpath('email', 'a[@class = "email"]/@href')
- # no need to call footer_loader.load_item()
- loader.load_item()
-
-You can nest loaders arbitrarily and they work with either xpath or css selectors.
-As a general guideline, use nested loaders when they make your code simpler but do
-not go overboard with nesting or your parser can become difficult to read.
diff --git a/docs/processors.rst b/docs/processors.rst
deleted file mode 100644
index 1456ac8..0000000
--- a/docs/processors.rst
+++ /dev/null
@@ -1,73 +0,0 @@
-.. currentmodule:: itemloaders
-
-.. _processors:
-
-Input and Output processors
-===========================
-
-An Item Loader contains one input processor and one output processor for each
-(item) field. The input processor processes the extracted data as soon as it's
-received (through the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css` or
-:meth:`~ItemLoader.add_value` methods) and the result of the input processor is
-collected and kept inside the ItemLoader. After collecting all data, the
-:meth:`ItemLoader.load_item` method is called to populate and get the populated
-item object. That's when the output processor is
-called with the data previously collected (and processed using the input
-processor). The result of the output processor is the final value that gets
-assigned to the item.
-
-Let's see an example to illustrate how the input and output processors are
-called for a particular field (the same applies for any other field)::
-
- l = ItemLoader(selector=some_selector)
- l.add_xpath('name', xpath1) # (1)
- l.add_xpath('name', xpath2) # (2)
- l.add_css('name', css) # (3)
- l.add_value('name', 'test') # (4)
- return l.load_item() # (5)
-
-So what happens is:
-
-1. Data from ``xpath1`` is extracted, and passed through the *input processor* of
- the ``name`` field. The result of the input processor is collected and kept in
- the Item Loader (but not yet assigned to the item).
-
-2. Data from ``xpath2`` is extracted, and passed through the same *input
- processor* used in (1). The result of the input processor is appended to the
- data collected in (1) (if any).
-
-3. This case is similar to the previous ones, except that the data is extracted
- from the ``css`` CSS selector, and passed through the same *input
- processor* used in (1) and (2). The result of the input processor is appended to the
- data collected in (1) and (2) (if any).
-
-4. This case is also similar to the previous ones, except that the value to be
- collected is assigned directly, instead of being extracted from a XPath
- expression or a CSS selector.
- However, the value is still passed through the input processors. In this
- case, since the value is not iterable it is converted to an iterable of a
- single element before passing it to the input processor, because input
- processor always receive iterables.
-
-5. The data collected in steps (1), (2), (3) and (4) is passed through
- the *output processor* of the ``name`` field.
- The result of the output processor is the value assigned to the ``name``
- field in the item.
-
-It's worth noticing that processors are just callable objects, which are called
-with the data to be parsed, and return a parsed value. So you can use any
-function as input or output processor. The only requirement is that they must
-accept one (and only one) positional argument, which will be an iterable.
-
-.. note:: Both input and output processors must receive an iterable as their
- first argument. The output of those functions can be anything. The result of
- input processors will be appended to an internal list (in the Loader)
- containing the collected values (for that field). The result of the output
- processors is the value that will be finally assigned to the item.
-
-The other thing you need to keep in mind is that the values returned by input
-processors are collected internally (in lists) and then passed to output
-processors to populate the fields.
-
-Last, but not least, ``itemloaders`` comes with some :ref:`commonly used processors
-<built-in-processors>` built-in for convenience.
diff --git a/docs/release-notes.rst b/docs/release-notes.rst
deleted file mode 100644
index a62c601..0000000
--- a/docs/release-notes.rst
+++ /dev/null
@@ -1,99 +0,0 @@
-.. currentmodule:: itemloaders
-
-.. _release-notes:
-
-Release notes
-=============
-
-.. _release-1.0.6:
-
-itemloaders 1.0.6 (2022-08-29)
-------------------------------
-
-Fixes a regression introduced in 1.0.5 that would cause the ``re`` parameter of
-:meth:`ItemLoader.add_xpath` and similar methods to be passed to lxml, which
-would trigger an exception when the value of ``re`` was a compiled pattern and
-not a string (:gh:`56`)
-
-.. _release-1.0.5:
-
-itemloaders 1.0.5 (2022-08-25)
-------------------------------
-
-- Allow additional args to be passed when calling :meth:`ItemLoader.add_xpath` (:gh:`48`)
-
-- Fixed missing space in an exception message (:gh:`47`)
-
-- Updated company name in author and copyright sections (:gh:`42`)
-
-- Added official support for Python 3.9 and improved PyPy compatibility (:gh:`44`)
-
-- Added official support for Python 3.10 (:gh:`53`)
-
-.. _release-1.0.4:
-
-itemloaders 1.0.4 (2020-11-12)
-------------------------------
-
-- When adding a :class:`scrapy.item.scrapy.Item` object as a value into an
- :class:`ItemLoader` object, that item is now added *as is*, instead of
- becoming a :class:`list` of keys from its :attr:`scrapy.item.scrapy.Item.fields`
- (:gh:`28`, :gh:`29`)
-
-- Increased test coverage (:gh:`27`)
-
-
-.. _release-1.0.3:
-
-itemloaders 1.0.3 (2020-09-09)
-------------------------------
-
-- Calls to :meth:`ItemLoader.get_output_value` no longer affect the output of
- :meth:`ItemLoader.load_item` (:gh:`21`, :gh:`22`)
-
-- Fixed some documentation links (:gh:`19`, :gh:`23`)
-
-- Fixed some test warnings (:gh:`24`)
-
-
-.. _release-1.0.2:
-
-itemloaders 1.0.2 (2020-08-05)
-------------------------------
-
-- Included the license file in the source releases (:gh:`13`)
-
-- Cleaned up some remnants of Python 2 (:gh:`16`, :gh:`17`)
-
-
-.. _release-1.0.1:
-
-itemloaders 1.0.1 (2020-07-02)
-------------------------------
-
-- Extended item type support to all item types supported by itemadapter_
- (:gh:`13`)
-
-- :ref:`Input and output processors <declaring-loaders>` defined in item
- field metadata are now taken into account (:gh:`13`)
-
-- Lowered some minimum dependency versions (:gh:`10`):
-
- - :doc:`parsel <parsel:index>`: 1.5.2 → 1.5.0
-
- - :doc:`w3lib <w3lib:index>`: 1.21.0 → 1.17.0
-
-- Improved the README file (:gh:`9`)
-
-- Improved continuous integration (:gh:`e62d95b`)
-
-
-.. _release-1.0.0:
-
-itemloaders 1.0.0 (2020-05-18)
-------------------------------
-
-Initial release, based on a part of the :doc:`Scrapy <scrapy:index>` code base.
-
-
-.. _itemadapter: https://github.com/scrapy/itemadapter#itemadapter
diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100644
index 948842b..0000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Sphinx>=3.0
-sphinx_rtd_theme>=0.4
diff --git a/itemloaders.egg-info/PKG-INFO b/itemloaders.egg-info/PKG-INFO
new file mode 100644
index 0000000..14a0f20
--- /dev/null
+++ b/itemloaders.egg-info/PKG-INFO
@@ -0,0 +1,103 @@
+Metadata-Version: 2.1
+Name: itemloaders
+Version: 1.0.6
+Summary: Base library for scrapy's ItemLoader
+Home-page: https://github.com/scrapy/itemloaders
+Author: Zyte
+Author-email: opensource@zyte.com
+License: BSD
+Project-URL: Documentation, https://itemloaders.readthedocs.io/
+Project-URL: Source, https://github.com/scrapy/itemloaders
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Requires-Python: >=3.7
+Description-Content-Type: text/x-rst
+License-File: LICENSE
+
+===========
+itemloaders
+===========
+
+.. image:: https://img.shields.io/pypi/v/itemloaders.svg
+ :target: https://pypi.python.org/pypi/itemloaders
+ :alt: PyPI Version
+
+.. image:: https://img.shields.io/pypi/pyversions/itemloaders.svg
+ :target: https://pypi.python.org/pypi/itemloaders
+ :alt: Supported Python Versions
+
+.. image:: https://github.com/scrapy/itemloaders/workflows/CI/badge.svg?branch=master
+ :target: https://github.com/scrapy/itemloaders/actions?workflow=CI
+ :alt: CI Status
+
+.. image:: https://codecov.io/github/scrapy/itemloaders/coverage.svg?branch=master
+ :target: https://codecov.io/gh/scrapy/itemloaders
+ :alt: Coverage report
+
+.. image:: https://readthedocs.org/projects/itemloaders/badge/?version=latest
+ :target: https://itemloaders.readthedocs.io/en/latest/?badge=latest
+ :alt: Documentation Status
+
+
+``itemloaders`` is a library that helps you collect data from HTML and XML sources.
+
+It comes in handy to extract data from web pages, as it supports
+data extraction using CSS and XPath Selectors.
+
+It's specially useful when you need to standardize the data from many sources.
+For example, it allows you to have all your casting and parsing rules in a
+single place.
+
+Here is an example to get you started::
+
+ from itemloaders import ItemLoader
+ from parsel import Selector
+
+ html_data = '''
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>Some random product page</title>
+ </head>
+ <body>
+ <div class="product_name">Some random product page</div>
+ <p id="price">$ 100.12</p>
+ </body>
+ </html>
+ '''
+ loader = ItemLoader(selector=Selector(html_data))
+ loader.add_xpath('name', '//div[@class="product_name"]/text()')
+ loader.add_xpath('name', '//div[@class="product_title"]/text()')
+ loader.add_css('price', '#price::text')
+ loader.add_value('last_updated', 'today') # you can also use literal values
+ item = loader.load_item()
+ item
+ # {'name': ['Some random product page'], 'price': ['$ 100.12'], 'last_updated': ['today']}
+
+For more information, check out the `documentation <https://itemloaders.readthedocs.io/en/latest/>`_.
+
+Contributing
+============
+
+All contributions are welcome!
+
+* If you want to review some code, check open
+ `Pull Requests here <https://github.com/scrapy/itemloaders/pulls>`_
+
+* If you want to submit a code change
+
+ * File an `issue here <https://github.com/scrapy/itemloaders/issues>`_, if there isn't one yet
+ * Fork this repository
+ * Create a branch to work on your changes
+ * Push your local branch and submit a Pull Request
diff --git a/itemloaders.egg-info/SOURCES.txt b/itemloaders.egg-info/SOURCES.txt
new file mode 100644
index 0000000..579cabd
--- /dev/null
+++ b/itemloaders.egg-info/SOURCES.txt
@@ -0,0 +1,15 @@
+LICENSE
+MANIFEST.in
+README.rst
+setup.cfg
+setup.py
+itemloaders/__init__.py
+itemloaders/common.py
+itemloaders/processors.py
+itemloaders/utils.py
+itemloaders.egg-info/PKG-INFO
+itemloaders.egg-info/SOURCES.txt
+itemloaders.egg-info/dependency_links.txt
+itemloaders.egg-info/not-zip-safe
+itemloaders.egg-info/requires.txt
+itemloaders.egg-info/top_level.txt
\ No newline at end of file
diff --git a/docs/_ext/__init__.py b/itemloaders.egg-info/dependency_links.txt
similarity index 100%
rename from docs/_ext/__init__.py
rename to itemloaders.egg-info/dependency_links.txt
diff --git a/itemloaders.egg-info/not-zip-safe b/itemloaders.egg-info/not-zip-safe
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/itemloaders.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/itemloaders.egg-info/requires.txt b/itemloaders.egg-info/requires.txt
new file mode 100644
index 0000000..e4dd8b4
--- /dev/null
+++ b/itemloaders.egg-info/requires.txt
@@ -0,0 +1,4 @@
+itemadapter>=0.1.0
+jmespath>=0.9.5
+parsel>=1.5.0
+w3lib>=1.17.0
diff --git a/itemloaders.egg-info/top_level.txt b/itemloaders.egg-info/top_level.txt
new file mode 100644
index 0000000..b958ea5
--- /dev/null
+++ b/itemloaders.egg-info/top_level.txt
@@ -0,0 +1 @@
+itemloaders
diff --git a/itemloaders/utils.py b/itemloaders/utils.py
index e764926..c64c702 100644
--- a/itemloaders/utils.py
+++ b/itemloaders/utils.py
@@ -55,20 +55,3 @@ def get_func_args(func, stripself=False):
if stripself:
func_args.pop(0)
return func_args
-
-
-def _getargspec_py23(func):
- """_getargspec_py23(function) -> named tuple ArgSpec(args, varargs, keywords,
- defaults)
-
- Was identical to inspect.getargspec() in python2, but uses
- inspect.getfullargspec() for python3 behind the scenes to avoid
- DeprecationWarning.
-
- >>> def f(a, b=2, *ar, **kw):
- ... pass
-
- >>> _getargspec_py23(f)
- ArgSpec(args=['a', 'b'], varargs='ar', keywords='kw', defaults=(2,))
- """
- return inspect.ArgSpec(*inspect.getfullargspec(func)[:4])
diff --git a/requirements-dev.txt b/requirements-dev.txt
deleted file mode 100644
index c17e1cf..0000000
--- a/requirements-dev.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-w3lib>=1.21.0
-parsel>=1.5.2
-jmespath>=0.9.5
-itemadapter>=0.1.0
-
-pytest==5.4.1
-flake8==3.7.9
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index ab1cdc8..2437d70 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -2,4 +2,9 @@
ignore = E266, E501, W503
max-line-length = 100
select = B,C,E,F,W,T4,B9
-exclude = .git,__pycache__,.venv
\ No newline at end of file
+exclude = .git,__pycache__,.venv
+
+[egg_info]
+tag_build =
+tag_date = 0
+
diff --git a/setup.py b/setup.py
index 0e6a004..56135c1 100644
--- a/setup.py
+++ b/setup.py
@@ -27,15 +27,15 @@ setup(
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: Implementation :: CPython',
'Programming Language :: Python :: Implementation :: PyPy',
],
- python_requires='>=3.6',
+ python_requires='>=3.7',
install_requires=[
# before updating these versions, be sure they are not higher than
# scrapy's requirements
diff --git a/tests/test_base_loader.py b/tests/test_base_loader.py
deleted file mode 100644
index b4ed396..0000000
--- a/tests/test_base_loader.py
+++ /dev/null
@@ -1,466 +0,0 @@
-from functools import partial
-import unittest
-
-from itemloaders import ItemLoader
-from itemloaders.processors import Compose, Identity, Join, MapCompose, TakeFirst
-
-
-class CustomItemLoader(ItemLoader):
- name_in = MapCompose(lambda v: v.title())
-
-
-class DefaultedItemLoader(ItemLoader):
- default_input_processor = MapCompose(lambda v: v[:-1])
-
-
-# test processors
-def processor_with_args(value, other=None, loader_context=None):
- if 'key' in loader_context:
- return loader_context['key']
- return value
-
-
-class BasicItemLoaderTest(unittest.TestCase):
-
- def test_load_item_using_default_loader(self):
- i = dict(summary='lala')
- il = ItemLoader(item=i)
- il.add_value('name', 'marta')
- item = il.load_item()
- assert item is i
- assert item['summary'] == ['lala']
- assert item['name'] == ['marta']
-
- def test_load_item_using_custom_loader(self):
- il = CustomItemLoader()
- il.add_value('name', 'marta')
- item = il.load_item()
- assert item['name'] == ['Marta']
-
- def test_load_item_ignore_none_field_values(self):
- def validate_sku(value):
- # Let's assume a SKU is only digits.
- if value.isdigit():
- return value
-
- class MyLoader(ItemLoader):
- name_out = Compose(lambda vs: vs[0]) # take first which allows empty values
- price_out = Compose(TakeFirst(), float)
- sku_out = Compose(TakeFirst(), validate_sku)
-
- valid_fragment = 'SKU: 1234'
- invalid_fragment = 'SKU: not available'
- sku_re = 'SKU: (.+)'
-
- il = MyLoader(item={})
- # Should not return "sku: None".
- il.add_value('sku', [invalid_fragment], re=sku_re)
- # Should not ignore empty values.
- il.add_value('name', '')
- il.add_value('price', ['0'])
- assert il.load_item() == {'name': '', 'price': 0.0}
-
- il.replace_value('sku', [valid_fragment], re=sku_re)
- self.assertEqual(il.load_item()['sku'], '1234')
-
- def test_self_referencing_loader(self):
- class MyLoader(ItemLoader):
- url_out = TakeFirst()
-
- def img_url_out(self, values):
- return (self.get_output_value('url') or '') + values[0]
-
- il = MyLoader(item={})
- il.add_value('url', 'http://example.com/')
- il.add_value('img_url', '1234.png')
- assert il.load_item() == {
- 'url': 'http://example.com/',
- 'img_url': 'http://example.com/1234.png',
- }
-
- il = MyLoader(item={})
- il.add_value('img_url', '1234.png')
- assert il.load_item() == {'img_url': '1234.png'}
-
- def test_add_value(self):
- il = CustomItemLoader()
- il.add_value('name', 'marta')
- assert il.get_collected_values('name') == ['Marta']
- assert il.get_output_value('name') == ['Marta']
-
- il.add_value('name', 'pepe')
- assert il.get_collected_values('name') == ['Marta', 'Pepe']
- assert il.get_output_value('name') == ['Marta', 'Pepe']
-
- # test add object value
- il.add_value('summary', {'key': 1})
- assert il.get_collected_values('summary') == [{'key': 1}]
-
- il.add_value(None, 'Jim', lambda x: {'name': x})
- assert il.get_collected_values('name') == ['Marta', 'Pepe', 'Jim']
-
- def test_add_zero(self):
- il = ItemLoader()
- il.add_value('name', 0)
- assert il.get_collected_values('name') == [0]
-
- def test_add_none(self):
- il = ItemLoader()
- il.add_value('name', None)
- assert il.get_collected_values('name') == []
-
- def test_replace_value(self):
- il = CustomItemLoader()
- il.replace_value('name', 'marta')
- self.assertEqual(il.get_collected_values('name'), ['Marta'])
- self.assertEqual(il.get_output_value('name'), ['Marta'])
- il.replace_value('name', 'pepe')
- self.assertEqual(il.get_collected_values('name'), ['Pepe'])
- self.assertEqual(il.get_output_value('name'), ['Pepe'])
-
- il.replace_value(None, 'Jim', lambda x: {'name': x})
- self.assertEqual(il.get_collected_values('name'), ['Jim'])
-
- def test_replace_value_none(self):
- il = CustomItemLoader()
- il.replace_value('name', None)
- self.assertEqual(il.get_collected_values('name'), [])
- il.replace_value('name', 'marta')
- self.assertEqual(il.get_collected_values('name'), ['Marta'])
- il.replace_value('name', None) # when replacing with `None` nothing should happen
- self.assertEqual(il.get_collected_values('name'), ['Marta'])
-
- def test_get_value(self):
- il = ItemLoader()
- self.assertEqual('FOO', il.get_value(['foo', 'bar'], TakeFirst(), str.upper))
- self.assertEqual(['foo', 'bar'], il.get_value(['name:foo', 'name:bar'], re='name:(.*)$'))
- self.assertEqual('foo', il.get_value(['name:foo', 'name:bar'], TakeFirst(), re='name:(.*)$'))
- self.assertEqual(None, il.get_value(['foo', 'bar'], TakeFirst(), re='name:(.*)$'))
- self.assertEqual(None, il.get_value(None, TakeFirst()))
-
- il.add_value('name', ['name:foo', 'name:bar'], TakeFirst(), re='name:(.*)$')
- self.assertEqual(['foo'], il.get_collected_values('name'))
- il.replace_value('name', 'name:bar', re='name:(.*)$')
- self.assertEqual(['bar'], il.get_collected_values('name'))
-
- def test_iter_on_input_processor_input(self):
- class NameFirstItemLoader(ItemLoader):
- name_in = TakeFirst()
-
- il = NameFirstItemLoader()
- il.add_value('name', 'marta')
- self.assertEqual(il.get_collected_values('name'), ['marta'])
- il = NameFirstItemLoader()
- il.add_value('name', ['marta', 'jose'])
- self.assertEqual(il.get_collected_values('name'), ['marta'])
-
- il = NameFirstItemLoader()
- il.replace_value('name', 'marta')
- self.assertEqual(il.get_collected_values('name'), ['marta'])
- il = NameFirstItemLoader()
- il.replace_value('name', ['marta', 'jose'])
- self.assertEqual(il.get_collected_values('name'), ['marta'])
-
- il = NameFirstItemLoader()
- il.add_value('name', 'marta')
- il.add_value('name', ['jose', 'pedro'])
- self.assertEqual(il.get_collected_values('name'), ['marta', 'jose'])
-
- def test_map_compose_filter(self):
- def filter_world(x):
- return None if x == 'world' else x
-
- proc = MapCompose(filter_world, str.upper)
- self.assertEqual(proc(['hello', 'world', 'this', 'is', 'scrapy']),
- ['HELLO', 'THIS', 'IS', 'SCRAPY'])
-
- def test_map_compose_filter_multil(self):
- class CustomItemLoader(ItemLoader):
- name_in = MapCompose(lambda v: v.title(), lambda v: v[:-1])
-
- il = CustomItemLoader()
- il.add_value('name', 'marta')
- self.assertEqual(il.get_output_value('name'), ['Mart'])
- item = il.load_item()
- self.assertEqual(item['name'], ['Mart'])
-
- def test_default_input_processor(self):
- il = DefaultedItemLoader()
- il.add_value('name', 'marta')
- self.assertEqual(il.get_output_value('name'), ['mart'])
-
- def test_inherited_default_input_processor(self):
- class InheritDefaultedItemLoader(DefaultedItemLoader):
- pass
-
- il = InheritDefaultedItemLoader()
- il.add_value('name', 'marta')
- self.assertEqual(il.get_output_value('name'), ['mart'])
-
- def test_input_processor_inheritance(self):
- class ChildItemLoader(CustomItemLoader):
- url_in = MapCompose(lambda v: v.lower())
-
- il = ChildItemLoader()
- il.add_value('url', 'HTTP://scrapy.ORG')
- self.assertEqual(il.get_output_value('url'), ['http://scrapy.org'])
- il.add_value('name', 'marta')
- self.assertEqual(il.get_output_value('name'), ['Marta'])
-
- class ChildChildItemLoader(ChildItemLoader):
- url_in = MapCompose(lambda v: v.upper())
- summary_in = MapCompose(lambda v: v)
-
- il = ChildChildItemLoader()
- il.add_value('url', 'http://scrapy.org')
- self.assertEqual(il.get_output_value('url'), ['HTTP://SCRAPY.ORG'])
- il.add_value('name', 'marta')
- self.assertEqual(il.get_output_value('name'), ['Marta'])
-
- def test_empty_map_compose(self):
- class IdentityDefaultedItemLoader(DefaultedItemLoader):
- name_in = MapCompose()
-
- il = IdentityDefaultedItemLoader()
- il.add_value('name', 'marta')
- self.assertEqual(il.get_output_value('name'), ['marta'])
-
- def test_identity_input_processor(self):
- class IdentityDefaultedItemLoader(DefaultedItemLoader):
- name_in = Identity()
-
- il = IdentityDefaultedItemLoader()
- il.add_value('name', 'marta')
- self.assertEqual(il.get_output_value('name'), ['marta'])
-
- def test_extend_custom_input_processors(self):
- class ChildItemLoader(CustomItemLoader):
- name_in = MapCompose(CustomItemLoader.name_in, str.swapcase)
-
- il = ChildItemLoader()
- il.add_value('name', 'marta')
- self.assertEqual(il.get_output_value('name'), ['mARTA'])
-
- def test_extend_default_input_processors(self):
- class ChildDefaultedItemLoader(DefaultedItemLoader):
- name_in = MapCompose(DefaultedItemLoader.default_input_processor, str.swapcase)
-
- il = ChildDefaultedItemLoader()
- il.add_value('name', 'marta')
- self.assertEqual(il.get_output_value('name'), ['MART'])
-
- def test_output_processor_using_function(self):
- il = CustomItemLoader()
- il.add_value('name', ['mar', 'ta'])
- self.assertEqual(il.get_output_value('name'), ['Mar', 'Ta'])
-
- class TakeFirstItemLoader(CustomItemLoader):
- name_out = u" ".join
-
- il = TakeFirstItemLoader()
- il.add_value('name', ['mar', 'ta'])
- self.assertEqual(il.get_output_value('name'), 'Mar Ta')
-
- def test_output_processor_error(self):
- class CustomItemLoader(ItemLoader):
- name_out = MapCompose(float)
-
- il = CustomItemLoader()
- il.add_value('name', ['$10'])
- try:
- float('$10')
- except Exception as e:
- expected_exc_str = str(e)
-
- exc = None
- try:
- il.load_item()
- except Exception as e:
- exc = e
- assert isinstance(exc, ValueError)
- s = str(exc)
- assert 'name' in s, s
- assert '$10' in s, s
- assert 'ValueError' in s, s
- assert expected_exc_str in s, s
-
- def test_output_processor_using_classes(self):
- il = CustomItemLoader()
- il.add_value('name', ['mar', 'ta'])
- self.assertEqual(il.get_output_value('name'), ['Mar', 'Ta'])
-
- class TakeFirstItemLoader(CustomItemLoader):
- name_out = Join()
-
- il = TakeFirstItemLoader()
- il.add_value('name', ['mar', 'ta'])
- self.assertEqual(il.get_output_value('name'), 'Mar Ta')
-
- class TakeFirstItemLoader(CustomItemLoader):
- name_out = Join("<br>")
-
- il = TakeFirstItemLoader()
- il.add_value('name', ['mar', 'ta'])
- self.assertEqual(il.get_output_value('name'), 'Mar<br>Ta')
-
- def test_default_output_processor(self):
- il = CustomItemLoader()
- il.add_value('name', ['mar', 'ta'])
- self.assertEqual(il.get_output_value('name'), ['Mar', 'Ta'])
-
- class LalaItemLoader(CustomItemLoader):
- default_output_processor = Identity()
-
- il = LalaItemLoader()
- il.add_value('name', ['mar', 'ta'])
- self.assertEqual(il.get_output_value('name'), ['Mar', 'Ta'])
-
- def test_loader_context_on_declaration(self):
- class ChildItemLoader(CustomItemLoader):
- url_in = MapCompose(processor_with_args, key='val')
-
- il = ChildItemLoader()
- il.add_value('url', 'text')
- self.assertEqual(il.get_output_value('url'), ['val'])
- il.replace_value('url', 'text2')
- self.assertEqual(il.get_output_value('url'), ['val'])
-
- def test_loader_context_on_instantiation(self):
- class ChildItemLoader(CustomItemLoader):
- url_in = MapCompose(processor_with_args)
-
- il = ChildItemLoader(key='val')
- il.add_value('url', 'text')
- self.assertEqual(il.get_output_value('url'), ['val'])
- il.replace_value('url', 'text2')
- self.assertEqual(il.get_output_value('url'), ['val'])
-
- def test_loader_context_on_assign(self):
- class ChildItemLoader(CustomItemLoader):
- url_in = MapCompose(processor_with_args)
-
- il = ChildItemLoader()
- il.context['key'] = 'val'
- il.add_value('url', 'text')
- self.assertEqual(il.get_output_value('url'), ['val'])
- il.replace_value('url', 'text2')
- self.assertEqual(il.get_output_value('url'), ['val'])
-
- def test_item_passed_to_input_processor_functions(self):
- def processor(value, loader_context):
- return loader_context['item']['name']
-
- class ChildItemLoader(CustomItemLoader):
- url_in = MapCompose(processor)
-
- it = dict(name='marta')
- il = ChildItemLoader(item=it)
- il.add_value('url', 'text')
- self.assertEqual(il.get_output_value('url'), ['marta'])
- il.replace_value('url', 'text2')
- self.assertEqual(il.get_output_value('url'), ['marta'])
-
- # def test_add_value_on_unknown_field(self):
- # il = CustomItemLoader()
- # self.assertRaises(KeyError, il.add_value, 'wrong_field', ['lala', 'lolo'])
-
- def test_compose_processor(self):
- class CustomItemLoader(ItemLoader):
- name_out = Compose(lambda v: v[0], lambda v: v.title(), lambda v: v[:-1])
-
- il = CustomItemLoader()
- il.add_value('name', ['marta', 'other'])
- self.assertEqual(il.get_output_value('name'), 'Mart')
- item = il.load_item()
- self.assertEqual(item['name'], 'Mart')
-
- def test_partial_processor(self):
- def join(values, sep=None, loader_context=None, ignored=None):
- if sep is not None:
- return sep.join(values)
- elif loader_context and 'sep' in loader_context:
- return loader_context['sep'].join(values)
- else:
- return ''.join(values)
-
- class CustomItemLoader(ItemLoader):
- name_out = Compose(partial(join, sep='+'))
- url_out = Compose(partial(join, loader_context={'sep': '.'}))
- summary_out = Compose(partial(join, ignored='foo'))
-
- il = CustomItemLoader()
- il.add_value('name', ['rabbit', 'hole'])
- il.add_value('url', ['rabbit', 'hole'])
- il.add_value('summary', ['rabbit', 'hole'])
- item = il.load_item()
- self.assertEqual(item['name'], 'rabbit+hole')
- self.assertEqual(item['url'], 'rabbit.hole')
- self.assertEqual(item['summary'], 'rabbithole')
-
- def test_error_input_processor(self):
- class CustomItemLoader(ItemLoader):
- name_in = MapCompose(float)
-
- il = CustomItemLoader()
- self.assertRaises(ValueError, il.add_value, 'name',
- ['marta', 'other'])
-
- def test_error_output_processor(self):
- class CustomItemLoader(ItemLoader):
- name_out = Compose(Join(), float)
-
- il = CustomItemLoader()
- il.add_value('name', 'marta')
- with self.assertRaises(ValueError):
- il.load_item()
-
- def test_error_processor_as_argument(self):
- il = CustomItemLoader()
- self.assertRaises(ValueError, il.add_value, 'name',
- ['marta', 'other'], Compose(float))
-
- def test_get_unset_value(self):
- loader = ItemLoader()
- self.assertEqual(loader.load_item(), {})
- self.assertEqual(loader.get_output_value('foo'), [])
- self.assertEqual(loader.load_item(), {})
-
-
-class BaseNoInputReprocessingLoader(ItemLoader):
- title_in = MapCompose(str.upper)
- title_out = TakeFirst()
-
-
-class NoInputReprocessingDictLoader(BaseNoInputReprocessingLoader):
- default_item_class = dict
-
-
-class NoInputReprocessingFromDictTest(unittest.TestCase):
- """
- Loaders initialized from loaded items must not reprocess fields (dict instances)
- """
- def test_avoid_reprocessing_with_initial_values_single(self):
- il = NoInputReprocessingDictLoader(item=dict(title='foo'))
- il_loaded = il.load_item()
- self.assertEqual(il_loaded, dict(title='foo'))
- self.assertEqual(NoInputReprocessingDictLoader(item=il_loaded).load_item(), dict(title='foo'))
-
- def test_avoid_reprocessing_with_initial_values_list(self):
- il = NoInputReprocessingDictLoader(item=dict(title=['foo', 'bar']))
- il_loaded = il.load_item()
- self.assertEqual(il_loaded, dict(title='foo'))
- self.assertEqual(NoInputReprocessingDictLoader(item=il_loaded).load_item(), dict(title='foo'))
-
- def test_avoid_reprocessing_without_initial_values_single(self):
- il = NoInputReprocessingDictLoader()
- il.add_value('title', 'foo')
- il_loaded = il.load_item()
- self.assertEqual(il_loaded, dict(title='FOO'))
- self.assertEqual(NoInputReprocessingDictLoader(item=il_loaded).load_item(), dict(title='FOO'))
-
- def test_avoid_reprocessing_without_initial_values_list(self):
- il = NoInputReprocessingDictLoader()
- il.add_value('title', ['foo', 'bar'])
- il_loaded = il.load_item()
- self.assertEqual(il_loaded, dict(title='FOO'))
- self.assertEqual(NoInputReprocessingDictLoader(item=il_loaded).load_item(), dict(title='FOO'))
diff --git a/tests/test_loader_initialization.py b/tests/test_loader_initialization.py
deleted file mode 100644
index 7e8d51a..0000000
--- a/tests/test_loader_initialization.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import unittest
-
-from itemloaders import ItemLoader
-
-
-class InitializationTestMixin:
-
- item_class = None
-
- def test_keep_single_value(self):
- """Loaded item should contain values from the initial item"""
- input_item = self.item_class(name='foo')
- il = ItemLoader(item=input_item)
- loaded_item = il.load_item()
- self.assertIsInstance(loaded_item, self.item_class)
- self.assertEqual(dict(loaded_item), {'name': ['foo']})
-
- def test_keep_list(self):
- """Loaded item should contain values from the initial item"""
- input_item = self.item_class(name=['foo', 'bar'])
- il = ItemLoader(item=input_item)
- loaded_item = il.load_item()
- self.assertIsInstance(loaded_item, self.item_class)
- self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar']})
-
- def test_add_value_singlevalue_singlevalue(self):
- """Values added after initialization should be appended"""
- input_item = self.item_class(name='foo')
- il = ItemLoader(item=input_item)
- il.add_value('name', 'bar')
- loaded_item = il.load_item()
- self.assertIsInstance(loaded_item, self.item_class)
- self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar']})
-
- def test_add_value_singlevalue_list(self):
- """Values added after initialization should be appended"""
- input_item = self.item_class(name='foo')
- il = ItemLoader(item=input_item)
- il.add_value('name', ['item', 'loader'])
- loaded_item = il.load_item()
- self.assertIsInstance(loaded_item, self.item_class)
- self.assertEqual(dict(loaded_item), {'name': ['foo', 'item', 'loader']})
-
- def test_add_value_list_singlevalue(self):
- """Values added after initialization should be appended"""
- input_item = self.item_class(name=['foo', 'bar'])
- il = ItemLoader(item=input_item)
- il.add_value('name', 'qwerty')
- loaded_item = il.load_item()
- self.assertIsInstance(loaded_item, self.item_class)
- self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar', 'qwerty']})
-
- def test_add_value_list_list(self):
- """Values added after initialization should be appended"""
- input_item = self.item_class(name=['foo', 'bar'])
- il = ItemLoader(item=input_item)
- il.add_value('name', ['item', 'loader'])
- loaded_item = il.load_item()
- self.assertIsInstance(loaded_item, self.item_class)
- self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar', 'item', 'loader']})
-
- def test_get_output_value_singlevalue(self):
- """Getting output value must not remove value from item"""
- input_item = self.item_class(name='foo')
- il = ItemLoader(item=input_item)
- self.assertEqual(il.get_output_value('name'), ['foo'])
- loaded_item = il.load_item()
- self.assertIsInstance(loaded_item, self.item_class)
- self.assertEqual(loaded_item, dict({'name': ['foo']}))
-
- def test_get_output_value_list(self):
- """Getting output value must not remove value from item"""
- input_item = self.item_class(name=['foo', 'bar'])
- il = ItemLoader(item=input_item)
- self.assertEqual(il.get_output_value('name'), ['foo', 'bar'])
- loaded_item = il.load_item()
- self.assertIsInstance(loaded_item, self.item_class)
- self.assertEqual(loaded_item, dict({'name': ['foo', 'bar']}))
-
- def test_values_single(self):
- """Values from initial item must be added to loader._values"""
- input_item = self.item_class(name='foo')
- il = ItemLoader(item=input_item)
- self.assertEqual(il._values.get('name'), ['foo'])
-
- def test_values_list(self):
- """Values from initial item must be added to loader._values"""
- input_item = self.item_class(name=['foo', 'bar'])
- il = ItemLoader(item=input_item)
- self.assertEqual(il._values.get('name'), ['foo', 'bar'])
-
-
-class InitializationFromDictTest(InitializationTestMixin, unittest.TestCase):
- item_class = dict
diff --git a/tests/test_nested_items.py b/tests/test_nested_items.py
deleted file mode 100644
index 0bdfbf2..0000000
--- a/tests/test_nested_items.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import unittest
-
-from itemloaders import ItemLoader
-
-
-class NestedItemTest(unittest.TestCase):
- """Test that adding items as values works as expected."""
-
- def _test_item(self, item):
- il = ItemLoader()
- il.add_value('item_list', item)
- self.assertEqual(il.load_item(), {'item_list': [item]})
-
- def test_attrs(self):
- try:
- import attr
- except ImportError:
- self.skipTest("Cannot import attr")
-
- @attr.s
- class TestItem:
- foo = attr.ib()
-
- self._test_item(TestItem(foo='bar'))
-
- def test_dataclass(self):
- try:
- from dataclasses import dataclass
- except ImportError:
- self.skipTest("Cannot import dataclasses.dataclass")
-
- @dataclass
- class TestItem:
- foo: str
-
- self._test_item(TestItem(foo='bar'))
-
- def test_dict(self):
- self._test_item({'foo': 'bar'})
-
- def test_scrapy_item(self):
- try:
- from scrapy import Field, Item
- except ImportError:
- self.skipTest("Cannot import Field or Item from scrapy")
-
- class TestItem(Item):
- foo = Field()
-
- self._test_item(TestItem(foo='bar'))
diff --git a/tests/test_nested_loader.py b/tests/test_nested_loader.py
deleted file mode 100644
index 1e193d3..0000000
--- a/tests/test_nested_loader.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import unittest
-
-from parsel import Selector
-
-from itemloaders import ItemLoader
-
-
-class SubselectorLoaderTest(unittest.TestCase):
- selector = Selector(text="""
- <html>
- <body>
- <header>
- <div id="id">marta</div>
- <p>paragraph</p>
- </header>
- <footer class="footer">
- <a href="http://www.scrapy.org">homepage</a>
- <img src="/images/logo.png" width="244" height="65" alt="Scrapy">
- </footer>
- </body>
- </html>
- """)
-
- def test_nested_xpath(self):
- loader = ItemLoader(selector=self.selector)
- nl = loader.nested_xpath("//header")
- nl.add_xpath('name', 'div/text()')
- nl.add_css('name_div', '#id')
- nl.add_value('name_value', nl.selector.xpath('div[@id = "id"]/text()').getall())
-
- self.assertEqual(loader.get_output_value('name'), ['marta'])
- self.assertEqual(loader.get_output_value('name_div'), ['<div id="id">marta</div>'])
- self.assertEqual(loader.get_output_value('name_value'), ['marta'])
-
- self.assertEqual(loader.get_output_value('name'), nl.get_output_value('name'))
- self.assertEqual(loader.get_output_value('name_div'), nl.get_output_value('name_div'))
- self.assertEqual(loader.get_output_value('name_value'), nl.get_output_value('name_value'))
-
- def test_nested_css(self):
- loader = ItemLoader(selector=self.selector)
- nl = loader.nested_css("header")
- nl.add_xpath('name', 'div/text()')
- nl.add_css('name_div', '#id')
- nl.add_value('name_value', nl.selector.xpath('div[@id = "id"]/text()').getall())
-
- self.assertEqual(loader.get_output_value('name'), ['marta'])
- self.assertEqual(loader.get_output_value('name_div'), ['<div id="id">marta</div>'])
- self.assertEqual(loader.get_output_value('name_value'), ['marta'])
-
- self.assertEqual(loader.get_output_value('name'), nl.get_output_value('name'))
- self.assertEqual(loader.get_output_value('name_div'), nl.get_output_value('name_div'))
- self.assertEqual(loader.get_output_value('name_value'), nl.get_output_value('name_value'))
-
- def test_nested_replace(self):
- loader = ItemLoader(selector=self.selector)
- nl1 = loader.nested_xpath('//footer')
- nl2 = nl1.nested_xpath('a')
-
- loader.add_xpath('url', '//footer/a/@href')
- self.assertEqual(loader.get_output_value('url'), ['http://www.scrapy.org'])
- nl1.replace_xpath('url', 'img/@src')
- self.assertEqual(loader.get_output_value('url'), ['/images/logo.png'])
- nl2.replace_xpath('url', '@href')
- self.assertEqual(loader.get_output_value('url'), ['http://www.scrapy.org'])
-
- def test_nested_ordering(self):
- loader = ItemLoader(selector=self.selector)
- nl1 = loader.nested_xpath('//footer')
- nl2 = nl1.nested_xpath('a')
-
- nl1.add_xpath('url', 'img/@src')
- loader.add_xpath('url', '//footer/a/@href')
- nl2.add_xpath('url', 'text()')
- loader.add_xpath('url', '//footer/a/@href')
-
- self.assertEqual(loader.get_output_value('url'), [
- '/images/logo.png',
- 'http://www.scrapy.org',
- 'homepage',
- 'http://www.scrapy.org',
- ])
-
- def test_nested_load_item(self):
- loader = ItemLoader(selector=self.selector)
- nl1 = loader.nested_xpath('//footer')
- nl2 = nl1.nested_xpath('img')
-
- loader.add_xpath('name', '//header/div/text()')
- nl1.add_xpath('url', 'a/@href')
- nl2.add_xpath('image', '@src')
-
- item = loader.load_item()
-
- assert item is loader.item
- assert item is nl1.item
- assert item is nl2.item
-
- self.assertEqual(item['name'], ['marta'])
- self.assertEqual(item['url'], ['http://www.scrapy.org'])
- self.assertEqual(item['image'], ['/images/logo.png'])
diff --git a/tests/test_output_processor.py b/tests/test_output_processor.py
deleted file mode 100644
index 54bb1fe..0000000
--- a/tests/test_output_processor.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import unittest
-
-from itemloaders import ItemLoader
-from itemloaders.processors import Identity, Compose, TakeFirst
-
-
-class TestOutputProcessorDict(unittest.TestCase):
- def test_output_processor(self):
-
- class TempDict(dict):
- def __init__(self, *args, **kwargs):
- super(TempDict, self).__init__(self, *args, **kwargs)
- self.setdefault('temp', 0.3)
-
- class TempLoader(ItemLoader):
- default_item_class = TempDict
- default_input_processor = Identity()
- default_output_processor = Compose(TakeFirst())
-
- loader = TempLoader()
- item = loader.load_item()
- self.assertIsInstance(item, TempDict)
- self.assertEqual(dict(item), {'temp': 0.3})
-
-
-class TestOutputProcessorItem(unittest.TestCase):
- def test_output_processor(self):
- class TempLoader(ItemLoader):
- default_input_processor = Identity()
- default_output_processor = Compose(TakeFirst())
-
- item = dict()
- item.setdefault('temp', 0.3)
- loader = TempLoader(item=item)
- item = loader.load_item()
- self.assertIsInstance(item, dict)
- self.assertEqual(dict(item), {'temp': 0.3})
diff --git a/tests/test_processors.py b/tests/test_processors.py
deleted file mode 100644
index 769597d..0000000
--- a/tests/test_processors.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import unittest
-
-from itemloaders.processors import (Compose, Identity, Join,
- MapCompose, TakeFirst)
-
-
-class ProcessorsTest(unittest.TestCase):
-
- def test_take_first(self):
- proc = TakeFirst()
- self.assertEqual(proc([None, '', 'hello', 'world']), 'hello')
- self.assertEqual(proc([None, '', 0, 'hello', 'world']), 0)
-
- def test_identity(self):
- proc = Identity()
- self.assertEqual(proc([None, '', 'hello', 'world']),
- [None, '', 'hello', 'world'])
-
- def test_join(self):
- proc = Join()
- self.assertRaises(TypeError, proc, [None, '', 'hello', 'world'])
- self.assertEqual(proc(['', 'hello', 'world']), u' hello world')
- self.assertEqual(proc(['hello', 'world']), u'hello world')
- self.assertIsInstance(proc(['hello', 'world']), str)
-
- def test_compose(self):
- proc = Compose(lambda v: v[0], str.upper)
- self.assertEqual(proc(['hello', 'world']), 'HELLO')
- proc = Compose(str.upper)
- self.assertEqual(proc(None), None)
- proc = Compose(str.upper, stop_on_none=False)
- self.assertRaises(ValueError, proc, None)
- proc = Compose(str.upper, lambda x: x + 1)
- self.assertRaises(ValueError, proc, 'hello')
-
- def test_mapcompose(self):
- def filter_world(x):
- return None if x == 'world' else x
- proc = MapCompose(filter_world, str.upper)
- self.assertEqual(proc([u'hello', u'world', u'this', u'is', u'scrapy']),
- [u'HELLO', u'THIS', u'IS', u'SCRAPY'])
- proc = MapCompose(filter_world, str.upper)
- self.assertEqual(proc(None), [])
- proc = MapCompose(filter_world, str.upper)
- self.assertRaises(ValueError, proc, [1])
- proc = MapCompose(filter_world, lambda x: x + 1)
- self.assertRaises(ValueError, proc, 'hello')
diff --git a/tests/test_select_jmes.py b/tests/test_select_jmes.py
deleted file mode 100644
index d3c8cc7..0000000
--- a/tests/test_select_jmes.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import unittest
-
-from itemloaders.processors import SelectJmes
-
-
-class SelectJmesTestCase(unittest.TestCase):
- test_list_equals = {
- 'simple': ('foo.bar', {"foo": {"bar": "baz"}}, "baz"),
- 'invalid': ('foo.bar.baz', {"foo": {"bar": "baz"}}, None),
- 'top_level': ('foo', {"foo": {"bar": "baz"}}, {"bar": "baz"}),
- 'double_vs_single_quote_string': ('foo.bar', {"foo": {"bar": "baz"}}, "baz"),
- 'dict': (
- 'foo.bar[*].name',
- {"foo": {"bar": [{"name": "one"}, {"name": "two"}]}},
- ['one', 'two']
- ),
- 'list': ('[1]', [1, 2], 2)
- }
-
- def test_output(self):
- for l in self.test_list_equals:
- expr, test_list, expected = self.test_list_equals[l]
- test = SelectJmes(expr)(test_list)
- self.assertEqual(
- test,
- expected,
- msg='test "{}" got {} expected {}'.format(l, test, expected)
- )
diff --git a/tests/test_selector_loader.py b/tests/test_selector_loader.py
deleted file mode 100644
index 170b56f..0000000
--- a/tests/test_selector_loader.py
+++ /dev/null
@@ -1,174 +0,0 @@
-import re
-import unittest
-
-from parsel import Selector
-
-from itemloaders import ItemLoader
-from itemloaders.processors import MapCompose, TakeFirst
-
-
-class CustomItemLoader(ItemLoader):
- name_in = MapCompose(lambda v: v.title())
-
-
-class SelectortemLoaderTest(unittest.TestCase):
- selector = Selector(text="""
- <html>
- <body>
- <div id="id">marta</div>
- <p>paragraph</p>
- <a href="http://www.scrapy.org">homepage</a>
- <img src="/images/logo.png" width="244" height="65" alt="Scrapy">
- </body>
- </html>
- """)
-
- def test_init_method(self):
- loader = CustomItemLoader()
- self.assertEqual(loader.selector, None)
-
- def test_init_method_errors(self):
- loader = CustomItemLoader()
- self.assertRaises(RuntimeError, loader.add_xpath, 'url', '//a/@href')
- self.assertRaises(RuntimeError, loader.replace_xpath, 'url', '//a/@href')
- self.assertRaises(RuntimeError, loader.get_xpath, '//a/@href')
- self.assertRaises(RuntimeError, loader.add_css, 'name', '#name::text')
- self.assertRaises(RuntimeError, loader.replace_css, 'name', '#name::text')
- self.assertRaises(RuntimeError, loader.get_css, '#name::text')
-
- def test_init_method_with_selector(self):
- loader = CustomItemLoader(selector=self.selector)
- self.assertTrue(loader.selector)
-
- loader.add_xpath('name', '//div/text()')
- self.assertEqual(loader.get_output_value('name'), ['Marta'])
-
- def test_init_method_with_selector_css(self):
- loader = CustomItemLoader(selector=self.selector)
- self.assertTrue(loader.selector)
-
- loader.add_css('name', 'div::text')
- self.assertEqual(loader.get_output_value('name'), [u'Marta'])
-
- loader.add_css('url', 'a::attr(href)')
- self.assertEqual(loader.get_output_value('url'), [u'http://www.scrapy.org'])
-
- # combining/accumulating CSS selectors and XPath expressions
- loader.add_xpath('name', '//div/text()')
- self.assertEqual(loader.get_output_value('name'), [u'Marta', u'Marta'])
-
- loader.add_xpath('url', '//img/@src')
- self.assertEqual(loader.get_output_value('url'), [u'http://www.scrapy.org', u'/images/logo.png'])
-
- def test_add_xpath_re(self):
- loader = CustomItemLoader(selector=self.selector)
- loader.add_xpath('name', '//div/text()', re='ma')
- self.assertEqual(loader.get_output_value('name'), ['Ma'])
-
- loader = CustomItemLoader(selector=self.selector)
- loader.add_xpath('name', '//div/text()', re=re.compile('ma'))
- self.assertEqual(loader.get_output_value('name'), ['Ma'])
-
-
- def test_add_xpath_variables(self):
- loader = CustomItemLoader(selector=self.selector)
- loader.add_xpath('name', 'id($id)/text()', id="id")
- self.assertEqual(loader.get_output_value('name'), ['Marta'])
- loader = CustomItemLoader(selector=self.selector)
- loader.add_xpath('name', 'id($id)/text()', id="id2")
- self.assertEqual(loader.get_output_value('name'), [])
-
- def test_replace_xpath(self):
- loader = CustomItemLoader(selector=self.selector)
- self.assertTrue(loader.selector)
- loader.add_xpath('name', '//div/text()')
- self.assertEqual(loader.get_output_value('name'), ['Marta'])
- loader.replace_xpath('name', '//p/text()')
- self.assertEqual(loader.get_output_value('name'), ['Paragraph'])
-
- loader.replace_xpath('name', ['//p/text()', '//div/text()'])
- self.assertEqual(loader.get_output_value('name'), ['Paragraph', 'Marta'])
-
- def test_get_xpath(self):
- loader = CustomItemLoader(selector=self.selector)
- self.assertEqual(loader.get_xpath('//p/text()'), ['paragraph'])
- self.assertEqual(loader.get_xpath('//p/text()', TakeFirst()), 'paragraph')
- self.assertEqual(loader.get_xpath('//p/text()', TakeFirst(), re='pa'), 'pa')
-
- self.assertEqual(loader.get_xpath(['//p/text()', '//div/text()']), ['paragraph', 'marta'])
-
- def test_replace_xpath_multi_fields(self):
- loader = CustomItemLoader(selector=self.selector)
- loader.add_xpath(None, '//div/text()', TakeFirst(), lambda x: {'name': x})
- self.assertEqual(loader.get_output_value('name'), ['Marta'])
- loader.replace_xpath(None, '//p/text()', TakeFirst(), lambda x: {'name': x})
- self.assertEqual(loader.get_output_value('name'), ['Paragraph'])
-
- def test_replace_xpath_re(self):
- loader = CustomItemLoader(selector=self.selector)
- self.assertTrue(loader.selector)
- loader.add_xpath('name', '//div/text()')
- self.assertEqual(loader.get_output_value('name'), ['Marta'])
- loader.replace_xpath('name', '//div/text()', re='ma')
- self.assertEqual(loader.get_output_value('name'), ['Ma'])
-
- def test_add_css_re(self):
- loader = CustomItemLoader(selector=self.selector)
- loader.add_css('name', 'div::text', re='ma')
- self.assertEqual(loader.get_output_value('name'), ['Ma'])
-
- loader.add_css('url', 'a::attr(href)', re='http://(.+)')
- self.assertEqual(loader.get_output_value('url'), ['www.scrapy.org'])
-
- loader = CustomItemLoader(selector=self.selector)
- loader.add_css('name', 'div::text', re=re.compile('ma'))
- self.assertEqual(loader.get_output_value('name'), ['Ma'])
-
- loader.add_css('url', 'a::attr(href)', re=re.compile('http://(.+)'))
- self.assertEqual(loader.get_output_value('url'), ['www.scrapy.org'])
-
- def test_replace_css(self):
- loader = CustomItemLoader(selector=self.selector)
- self.assertTrue(loader.selector)
- loader.add_css('name', 'div::text')
- self.assertEqual(loader.get_output_value('name'), ['Marta'])
- loader.replace_css('name', 'p::text')
- self.assertEqual(loader.get_output_value('name'), ['Paragraph'])
-
- loader.replace_css('name', ['p::text', 'div::text'])
- self.assertEqual(loader.get_output_value('name'), ['Paragraph', 'Marta'])
-
- loader.add_css('url', 'a::attr(href)', re='http://(.+)')
- self.assertEqual(loader.get_output_value('url'), ['www.scrapy.org'])
- loader.replace_css('url', 'img::attr(src)')
- self.assertEqual(loader.get_output_value('url'), ['/images/logo.png'])
-
- def test_get_css(self):
- loader = CustomItemLoader(selector=self.selector)
- self.assertEqual(loader.get_css('p::text'), [u'paragraph'])
- self.assertEqual(loader.get_css('p::text', TakeFirst()), 'paragraph')
- self.assertEqual(loader.get_css('p::text', TakeFirst(), re='pa'), u'pa')
-
- self.assertEqual(loader.get_css(['p::text', 'div::text']), ['paragraph', 'marta'])
- self.assertEqual(loader.get_css(['a::attr(href)', 'img::attr(src)']),
- [u'http://www.scrapy.org', '/images/logo.png'])
-
- def test_replace_css_multi_fields(self):
- loader = CustomItemLoader(selector=self.selector)
- loader.add_css(None, 'div::text', TakeFirst(), lambda x: {'name': x})
- self.assertEqual(loader.get_output_value('name'), ['Marta'])
- loader.replace_css(None, 'p::text', TakeFirst(), lambda x: {'name': x})
- self.assertEqual(loader.get_output_value('name'), ['Paragraph'])
-
- loader.add_css(None, 'a::attr(href)', TakeFirst(), lambda x: {'url': x})
- self.assertEqual(loader.get_output_value('url'), ['http://www.scrapy.org'])
- loader.replace_css(None, 'img::attr(src)', TakeFirst(), lambda x: {'url': x})
- self.assertEqual(loader.get_output_value('url'), ['/images/logo.png'])
-
- def test_replace_css_re(self):
- loader = CustomItemLoader(selector=self.selector)
- self.assertTrue(loader.selector)
- loader.add_css('url', 'a::attr(href)')
- self.assertEqual(loader.get_output_value('url'), ['http://www.scrapy.org'])
- loader.replace_css('url', 'a::attr(href)', re=r'http://www\.(.+)')
- self.assertEqual(loader.get_output_value('url'), ['scrapy.org'])
diff --git a/tests/test_utils_misc.py b/tests/test_utils_misc.py
deleted file mode 100644
index 36f7c80..0000000
--- a/tests/test_utils_misc.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import unittest
-
-from itemloaders.utils import arg_to_iter
-
-
-class UtilsMiscTestCase(unittest.TestCase):
-
- def test_arg_to_iter(self):
- assert hasattr(arg_to_iter(None), '__iter__')
- assert hasattr(arg_to_iter(100), '__iter__')
- assert hasattr(arg_to_iter('lala'), '__iter__')
- assert hasattr(arg_to_iter([1, 2, 3]), '__iter__')
- assert hasattr(arg_to_iter(l for l in 'abcd'), '__iter__')
-
- self.assertEqual(list(arg_to_iter(None)), [])
- self.assertEqual(list(arg_to_iter('lala')), ['lala'])
- self.assertEqual(list(arg_to_iter(100)), [100])
- self.assertEqual(list(arg_to_iter(l for l in 'abc')), ['a', 'b', 'c'])
- self.assertEqual(list(arg_to_iter([1, 2, 3])), [1, 2, 3])
- self.assertEqual(list(arg_to_iter({'a': 1})), [{'a': 1}])
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/test_utils_python.py b/tests/test_utils_python.py
deleted file mode 100644
index 0547c95..0000000
--- a/tests/test_utils_python.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import functools
-import operator
-import platform
-import unittest
-from datetime import datetime
-
-from itemloaders.utils import get_func_args
-
-
-class UtilsPythonTestCase(unittest.TestCase):
-
- def test_get_func_args(self):
- def f1(a, b, c):
- pass
-
- def f2(a, b=None, c=None):
- pass
-
- def f3(a, b=None, *, c=None):
- pass
-
- class A:
- def __init__(self, a, b, c):
- pass
-
- def method(self, a, b, c):
- pass
-
- class Callable:
-
- def __call__(self, a, b, c):
- pass
-
- a = A(1, 2, 3)
- cal = Callable()
- partial_f1 = functools.partial(f1, None)
- partial_f2 = functools.partial(f1, b=None)
- partial_f3 = functools.partial(partial_f2, None)
-
- self.assertEqual(get_func_args(f1), ['a', 'b', 'c'])
- self.assertEqual(get_func_args(f2), ['a', 'b', 'c'])
- self.assertEqual(get_func_args(f3), ['a', 'b', 'c'])
- self.assertEqual(get_func_args(A), ['a', 'b', 'c'])
- self.assertEqual(get_func_args(a.method), ['a', 'b', 'c'])
- self.assertEqual(get_func_args(partial_f1), ['b', 'c'])
- self.assertEqual(get_func_args(partial_f2), ['a', 'c'])
- self.assertEqual(get_func_args(partial_f3), ['c'])
- self.assertEqual(get_func_args(cal), ['a', 'b', 'c'])
- self.assertEqual(get_func_args(object), [])
-
- if platform.python_implementation() == 'CPython':
- # TODO: how do we fix this to return the actual argument names?
- self.assertEqual(get_func_args(str.split), [])
- self.assertEqual(get_func_args(" ".join), [])
- self.assertEqual(get_func_args(operator.itemgetter(2)), [])
- elif platform.python_implementation() == 'PyPy':
- self.assertEqual(get_func_args(str.split, stripself=True), ['sep', 'maxsplit'])
- self.assertEqual(get_func_args(operator.itemgetter(2), stripself=True), ['obj'])
-
- build_date = datetime.strptime(platform.python_build()[1], '%b %d %Y')
- if build_date >= datetime(2020, 4, 7): # PyPy 3.6-v7.3.1
- self.assertEqual(get_func_args(" ".join, stripself=True), ['iterable'])
- else:
- self.assertEqual(get_func_args(" ".join, stripself=True), ['list'])
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tox.ini b/tox.ini
deleted file mode 100644
index bf4e668..0000000
--- a/tox.ini
+++ /dev/null
@@ -1,38 +0,0 @@
-[tox]
-envlist = py35,py36,py37,py38,py39,py310
-
-[testenv]
-deps =
- pytest
- pytest-cov
-
-commands =
- py.test \
- --cov-report=term --cov-report=html --cov-report= --cov=itemloaders \
- --doctest-modules \
- {posargs:itemloaders tests}
-
-[testenv:extra-deps]
-deps =
- {[testenv]deps}
- attrs
- scrapy
-
-[testenv:pypy3]
-basepython = pypy3
-
-[docs]
-changedir = docs
-deps =
- -rdocs/requirements.txt
-setenv =
- READTHEDOCS_PROJECT=itemloaders
- READTHEDOCS_VERSION=master
-
-[testenv:docs]
-basepython = python3
-changedir = {[docs]changedir}
-deps = {[docs]deps}
-setenv = {[docs]setenv}
-commands =
- sphinx-build -W -b html . {envtmpdir}/html