scrapy · wRAR · Apr 11, 2023 · Jan 2, 2020 · Jan 2, 2020 · Jan 2, 2020
diff --git a/docs/conf.py b/docs/conf.py
@@ -127,6 +127,8 @@
 
 # nitpicky = True  # https://github.com/scrapy/cssselect/pull/110
 nitpick_ignore = [
+    ('py:class', 'ExpressionError'),
+    ('py:class', 'SelectorSyntaxError'),
     ('py:class', 'cssselect.xpath.GenericTranslator'),
     ('py:class', 'cssselect.xpath.HTMLTranslator'),
     ('py:class', 'cssselect.xpath.XPathExpr'),

diff --git a/parsel/selector.py b/parsel/selector.py
@@ -1,9 +1,11 @@
 """
-XPath selectors based on lxml
+XPath and JMESPath selectors based on lxml and jmespath
 """
 
+import json
 import sys
 
+import jmespath
 import six
 from lxml import etree, html
 
@@ -35,15 +37,6 @@ def __init__(self, *args, **kwargs):
 }
 
 
-def _st(st):
-    if st is None:
-        return 'html'
-    elif st in _ctgroup:
-        return st
-    else:
-        raise ValueError('Invalid type: %s' % st)
-
-
 def create_root_node(text, parser_cls, base_url=None):
     """Create root node for text using given parser class.
     """
@@ -73,12 +66,26 @@ def __getitem__(self, pos):
     def __getstate__(self):
         raise TypeError("can't pickle SelectorList objects")
 
+    def jmespath(self, query, **kwargs):
+        """
+        Call the ``.jmespath()`` method for each element in this list and return
+        their results flattened as another :class:`SelectorList`.
+
+        ``query`` is the same argument as the one in :meth:`Selector.jmespath`.
+
+        Any additional named arguments are passed to the underlying
+        ``jmespath.search`` call, e.g.::
+
+            selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
+        """
+        return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self]))
+
     def xpath(self, xpath, namespaces=None, **kwargs):
         """
         Call the ``.xpath()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
 
-        ``query`` is the same argument as the one in :meth:`Selector.xpath`
+        ``xpath`` is the same argument as the one in :meth:`Selector.xpath`
 
         ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
         for additional prefixes to those registered with ``register_namespace(prefix, uri)``.
@@ -135,6 +142,7 @@ def getall(self):
         their results flattened, as a list of unicode strings.
         """
         return [x.get() for x in self]
+
     extract = getall
 
     def get(self, default=None):
@@ -145,6 +153,7 @@ def get(self, default=None):
         for x in self:
             return x.get()
         return default
+
     extract_first = get
 
     @property
@@ -164,24 +173,32 @@ def remove(self):
             x.remove()
 
 
+_NOTSET = object()
+
+
+def _load_json_or_none(text):
+    try:
+        return json.loads(text)
+    except ValueError:
+        return None
+
+
 class Selector(object):
     """
     :class:`Selector` allows you to select parts of an XML or HTML text using CSS
     or XPath expressions and extract data from it.
 
     ``text`` is a ``unicode`` object in Python 2 or a ``str`` object in Python 3
 
-    ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default).
-    If ``type`` is ``None``, the selector defaults to ``"html"``.
+    ``type`` defines the selector type. It can be ``"html"`` (default),
+    ``"json"``, ``"text"``, or ``"xml"``.
 
     ``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths.
     See [`lxml` documentation](https://lxml.de/api/index.html) ``lxml.etree.fromstring`` for more information.
     """
 
-    __slots__ = ['text', 'namespaces', 'type', '_expr', 'root',
-                 '__weakref__', '_parser', '_csstranslator', '_tostring_method']
+    __slots__ = ['namespaces', 'type', '_expr', 'root', '_text', '__weakref__']
 
-    _default_type = None
     _default_namespaces = {
         "re": "http://exslt.org/regular-expressions",
 
@@ -196,33 +213,95 @@ class Selector(object):
     _lxml_smart_strings = False
     selectorlist_cls = SelectorList
 
-    def __init__(self, text=None, type=None, namespaces=None, root=None,
+    def __init__(self, text=None, type=None, namespaces=None, root=_NOTSET,
                  base_url=None, _expr=None):
-        self.type = st = _st(type or self._default_type)
-        self._parser = _ctgroup[st]['_parser']
-        self._csstranslator = _ctgroup[st]['_csstranslator']
-        self._tostring_method = _ctgroup[st]['_tostring_method']
+        if type not in ('html', 'json', 'text', 'xml', None):
+            raise ValueError('Invalid type: %s' % type)
 
-        if text is not None:
-            if not isinstance(text, six.text_type):
-                msg = "text argument should be of type %s, got %s" % (
-                    six.text_type, text.__class__)
-                raise TypeError(msg)
-            root = self._get_root(text, base_url)
-        elif root is None:
+        self._text = text
+
+        if text is None and root is _NOTSET:
             raise ValueError("Selector needs either text or root argument")
 
+        if text is not None and not isinstance(text, six.text_type):
+            msg = "text argument should be of type %s, got %s" % (
+                six.text_type, text.__class__)
+            raise TypeError(msg)
+
+        if text is not None:
+            if type in ('html', 'xml', None):
+                self._load_lxml_root(text, type=type or 'html', base_url=base_url)
+            elif type == 'json':
+                self.root = _load_json_or_none(text)
+                self.type = type
+            else:
+                self.root = text
+                self.type = type
+        else:
+            self.root = root
+            if type is None and isinstance(self.root, etree._Element):
+                type = 'html'
+            self.type = type or 'json'
+
+        self._expr = _expr
         self.namespaces = dict(self._default_namespaces)
         if namespaces is not None:
             self.namespaces.update(namespaces)
-        self.root = root
-        self._expr = _expr
+
+    def _load_lxml_root(self, text, type, base_url=None):
+        self.type = type
+        self.root = self._get_root(text, base_url)
 
     def __getstate__(self):
         raise TypeError("can't pickle Selector objects")
 
     def _get_root(self, text, base_url=None):
-        return create_root_node(text, self._parser, base_url=base_url)
+        return create_root_node(
+            text,
+            _ctgroup[self.type]['_parser'],
+            base_url=base_url,
+        )
+
+    def jmespath(self, query, type=None, **kwargs):
+        """
+        Find objects matching the JMESPath ``query`` and return the result as a
+        :class:`SelectorList` instance with all elements flattened. List
+        elements implement the :class:`Selector` interface too.
+
+        ``query`` is a string containing the `JMESPath
+        <https://jmespath.org/>`_ query to apply.
+
+        ``type`` sets the type of the returned selectors and allows the same
+        values as the ``type`` argument of ``__init__``. If not specified,
+        string results use ``"text"`` and all other results use ``"json"``.
+
+        Any additional named arguments are passed to the underlying
+        ``jmespath.search`` call, e.g.::
+
+            selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
+        """
+        if self.type == 'json':
+            data = self.root
+        elif isinstance(self.root, six.string_types):
+            data = _load_json_or_none(self.root)
+        elif self.root.text is None:
+            data = _load_json_or_none(self._text)
+        else:
+            data = _load_json_or_none(self.root.text)
+        result = jmespath.search(query, data, **kwargs)
+        if result is None:
+            result = []
+        elif not isinstance(result, list):
+            result = [result]
+
+        def make_selector(x):  # closure function
+            if isinstance(x, six.text_type):
+                return self.__class__(text=x, _expr=query, type=type or 'text')
+            else:
+                return self.__class__(root=x, _expr=query, type=type)
+
+        result = [make_selector(x) for x in result]
+        return self.selectorlist_cls(result)
 
     def xpath(self, query, namespaces=None, **kwargs):
         """
@@ -242,6 +321,11 @@ def xpath(self, query, namespaces=None, **kwargs):
 
             selector.xpath('//a[href=$url]', url="http://www.example.com")
         """
+        if self.type == 'text':
+            self._load_lxml_root(self.root, type='html')
+        elif self.type not in ('html', 'xml'):
+            raise ValueError('Cannot use xpath on a Selector of type {}'
+                             .format(repr(self.type)))
         try:
             xpathev = self.root.xpath
         except AttributeError:
@@ -279,10 +363,15 @@ def css(self, query):
 
         .. _cssselect: https://pypi.python.org/pypi/cssselect/
         """
+        if self.type == 'text':
+            self._load_lxml_root(self.root, type='html')
+        elif self.type not in ('html', 'xml'):
+            raise ValueError('Cannot use css on a Selector of type {}'
+                             .format(repr(self.type)))
         return self.xpath(self._css2xpath(query))
 
     def _css2xpath(self, query):
-        return self._csstranslator.css_to_xpath(query)
+        return _ctgroup[self.type]['_csstranslator'].css_to_xpath(query)
 
     def re(self, regex, replace_entities=True):
         """
@@ -317,18 +406,23 @@ def get(self):
         Serialize and return the matched nodes in a single unicode string.
         Percent encoded content is unquoted.
         """
+        if self.type in ('text', 'json'):
+            return self.root
         try:
-            return etree.tostring(self.root,
-                                  method=self._tostring_method,
-                                  encoding='unicode',
-                                  with_tail=False)
+            return etree.tostring(
+                self.root,
+                method=_ctgroup[self.type]['_tostring_method'],
+                encoding='unicode',
+                with_tail=False,
+            )
         except (AttributeError, TypeError):
             if self.root is True:
                 return u'1'
             elif self.root is False:
                 return u'0'
             else:
                 return six.text_type(self.root)
+
     extract = get
 
     def getall(self):
@@ -397,9 +491,12 @@ def __bool__(self):
         given by the contents it selects.
         """
         return bool(self.get())
+
     __nonzero__ = __bool__
 
     def __str__(self):
         data = repr(shorten(self.get(), width=40))
-        return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
+        expr_field = 'jmespath' if self.type == 'json' else 'xpath'
+        return "<%s %s=%r data=%s>" % (type(self).__name__, expr_field, self._expr, data)
+
     __repr__ = __str__
diff --git a/pylintrc b/pylintrc
@@ -4,7 +4,7 @@ persistent=no
 [MESSAGES CONTROL]
 disable=bad-continuation,
         c-extension-no-member,
-        deprecated-method,
+        deprecated-method,  # Required for Python 2 support
         fixme,
         import-error,
         import-outside-toplevel,
@@ -27,4 +27,4 @@ disable=bad-continuation,
         unused-argument,
         useless-object-inheritance,  # Required for Python 2 support
         wrong-import-order,
-        wrong-import-position
+        wrong-import-position,
diff --git a/pytest.ini b/pytest.ini
@@ -7,5 +7,6 @@ flake8-ignore =
     parsel/xpathfuncs.py E501
     tests/test_selector.py E501
     tests/test_selector_csstranslator.py E501
+    tests/test_selector_jmespath.py E501
     tests/test_utils.py E501
     tests/test_xpathfuncs.py E501
diff --git a/setup.py b/setup.py
@@ -29,7 +29,8 @@ def has_environment_marker_platform_impl_support():
     'w3lib>=1.19.0',
     'lxml',
     'six>=1.6.0',
-    'cssselect>=0.9'
+    'cssselect>=0.9',
+    'jmespath',
 ]
 extras_require = {}
 

diff --git a/tests/test_selector.py b/tests/test_selector.py
@@ -5,6 +5,8 @@
 import unittest
 import pickle
 
+import lxml.etree
+
 from parsel import Selector
 from parsel.selector import (
     CannotRemoveElementWithoutRoot,
@@ -814,6 +816,51 @@ def test_remove_root_element_selector(self):
         sel.css('body').remove()
         self.assertEqual(sel.get(), '<html></html>')
 
+    def test_invalid_type(self):
+        with self.assertRaises(ValueError):
+            self.sscls(u'', type='xhtml')
+
+    def test_default_type(self):
+        text = u'foo'
+        selector = self.sscls(text)
+        self.assertEqual(selector.type, 'html')
+
+    def test_json_type(self):
+        obj = 1
+        selector = self.sscls(six.text_type(obj), type='json')
+        self.assertEqual(selector.root, obj)
+        self.assertEqual(selector.type, 'json')
+
+    def test_html_root(self):
+        root = lxml.etree.fromstring('<html/>')
+        selector = self.sscls(root=root)
+        self.assertEqual(selector.root, root)
+        self.assertEqual(selector.type, 'html')
+
+    def test_json_root(self):
+        obj = 1
+        selector = self.sscls(root=obj)
+        self.assertEqual(selector.root, obj)
+        self.assertEqual(selector.type, 'json')
+
+    def test_json_xpath(self):
+        obj = 1
+        selector = self.sscls(root=obj)
+        with self.assertRaises(ValueError):
+            selector.xpath('//*')
+
+    def test_json_css(self):
+        obj = 1
+        selector = self.sscls(root=obj)
+        with self.assertRaises(ValueError):
+            selector.css('*')
+
+    def test_invalid_json(self):
+        text = u'<html/>'
+        selector = self.sscls(text, type='json')
+        self.assertEqual(selector.root, None)
+        self.assertEqual(selector.type, 'json')
+
 
 class ExsltTestCase(unittest.TestCase):