# Copyright (C) 2008 - Olivier Lauzanne <olauzanne@gmail.com>
#
# Distributed under the BSD license, see LICENSE.txt
from .cssselectpatch import JQueryTranslator
from collections import OrderedDict
from urllib.parse import urlencode
from urllib.parse import urljoin
from .openers import url_opener
from .text import extract_text
from copy import deepcopy
from lxml import etree
import lxml.html
import inspect
import types
# Tuple of string-like types used in isinstance() checks throughout this
# module.
basestring = (str, bytes)
def getargspec(func):
    """Return the names of the positional-or-keyword parameters of ``func``.

    Parameters of any other kind (``*args``, ``**kwargs``, keyword-only,
    positional-only) are left out of the returned list.
    """
    names = []
    for param in inspect.signature(func).parameters.values():
        if param.kind == param.POSITIONAL_OR_KEYWORD:
            names.append(param.name)
    return names
# Names of the snake_case methods registered for a camelCase alias.
_camel_case_aliases = set()


def with_camel_case_alias(func):
    """Decorator registering ``func`` for a camelCase alias.

    Only the function name is recorded (in ``_camel_case_aliases``); the
    function itself is returned unchanged.
    """
    registered_name = func.__name__
    _camel_case_aliases.add(registered_name)
    return func
def build_camel_case_aliases(PyQuery):
    """Add a camelCase alias to ``PyQuery`` for every registered method.

    For each name in ``_camel_case_aliases`` (``remove_attr`` for example) a
    new function object is built from the original's code, globals and
    defaults, named in camelCase (``removeAttr``) and set on the class.
    """
    doc_template = 'Alias for :func:`~pyquery.pyquery.PyQuery.%s`'
    for snake_name in _camel_case_aliases:
        head, *tail = snake_name.split('_')
        camel_name = head + ''.join(word.title() for word in tail)
        original = getattr(PyQuery, snake_name)
        alias = types.FunctionType(
            original.__code__, original.__globals__,
            camel_name, original.__defaults__)
        alias.__doc__ = doc_template % original.__name__
        setattr(PyQuery, camel_name, alias.__get__(None, PyQuery))
def fromstring(context, parser=None, custom_parser=None):
    """Parse ``context`` and return the result as a list of nodes.

    ``context`` is handed to the ``parse`` function of the chosen backend
    when it has a callable ``read`` attribute, and to ``fromstring``
    otherwise.

    * With neither ``parser`` nor ``custom_parser``, ``lxml.etree`` is tried
      first and ``lxml.html`` is used when an ``XMLSyntaxError`` is raised.
    * ``parser`` may be ``'xml'``, ``'html'``, ``'html5'``, ``'soup'`` or
      ``'html_fragments'``; anything else raises ``ValueError``.
    * ``custom_parser`` is any callable taking ``context``; when given,
      ``parser`` is not consulted.
    """
    is_file_like = (hasattr(context, 'read') and
                    hasattr(context.read, '__call__'))
    meth = 'parse' if is_file_like else 'fromstring'

    if custom_parser is None and parser is None:
        # use html parser if we don't have clean xml
        try:
            result = getattr(etree, meth)(context)
        except etree.XMLSyntaxError:
            if hasattr(context, 'seek'):
                # rewind so that the second parser sees the whole input
                context.seek(0)
            result = getattr(lxml.html, meth)(context)
        if isinstance(result, etree._ElementTree):
            return [result.getroot()]
        return [result]

    if custom_parser is None:
        if parser == 'xml':
            custom_parser = getattr(etree, meth)
        elif parser == 'html':
            custom_parser = getattr(lxml.html, meth)
        elif parser == 'html5':
            from lxml.html import html5parser
            custom_parser = getattr(html5parser, meth)
        elif parser == 'soup':
            from lxml.html import soupparser
            custom_parser = getattr(soupparser, meth)
        elif parser == 'html_fragments':
            custom_parser = lxml.html.fragments_fromstring
        else:
            raise ValueError('No such parser: "%s"' % parser)

    # Normalise whatever the parser produced into a list.
    result = custom_parser(context)
    if type(result) is list:
        return result
    if isinstance(result, etree._ElementTree):
        return [result.getroot()]
    if result is None:
        return []
    return [result]
def callback(func, *args):
    """Call ``func`` with only as many of ``args`` as it declares.

    The number of accepted arguments is ``func.__code__.co_argcount``;
    surplus trailing arguments are dropped.
    """
    arity = func.__code__.co_argcount
    accepted = args[:arity]
    return func(*accepted)
class NoDefault:
    """Type of the ``no_default`` sentinel ("argument not supplied")."""

    def __repr__(self):
        # clean representation in Sphinx
        return '<NoDefault>'


# The single sentinel instance; the class name is then removed from the
# module namespace.
no_default = NoDefault()
del NoDefault
class FlexibleElement(object):
    """Descriptor giving a method a flexible, JavaScript-like API.

    Reading the attribute on an instance yields a small proxy object.
    Calling the proxy, or getting/setting an attribute or an item on it,
    all forward to ``pget(instance, ...)``.  Deleting an attribute or an
    item forwards to ``pdel``; assigning to the attribute itself forwards
    to ``pset``.  ``NotImplementedError`` is raised when the corresponding
    handler was not supplied.
    """

    def __init__(self, pget, pset=no_default, pdel=no_default):
        self.pget, self.pset, self.pdel = pget, pset, pdel

    def __get__(self, instance, klass):
        descriptor = self

        class _element(object):
            """real element to support set/get/del attr and item and js call
            style"""

            def __call__(prop, *args, **kwargs):
                return descriptor.pget(instance, *args, **kwargs)

            # every access style is routed to the getter
            __getattr__ = __call__
            __getitem__ = __call__
            __setattr__ = __call__
            __setitem__ = __call__

            def __delitem__(prop, name):
                if descriptor.pdel is no_default:
                    raise NotImplementedError()
                return descriptor.pdel(instance, name)

            __delattr__ = __delitem__

            def __repr__(prop):
                return '<flexible_element %s>' % descriptor.pget.__name__

        return _element()

    def __set__(self, instance, value):
        if self.pset is no_default:
            raise NotImplementedError()
        self.pset(instance, value)
[docs]class PyQuery(list):
"""The main class
"""
_translator_class = JQueryTranslator
def __init__(self, *args, **kwargs):
    """Build a selection from markup, nodes, a file or a URL.

    Positional forms:

    * ``PyQuery(context)`` or ``PyQuery(selector, context)`` where
      ``context`` is a string/bytes of markup, another instance of this
      class, a list of nodes or a single lxml element.
    * ``PyQuery('http://...')`` / ``PyQuery('https://...')`` is treated as
      ``PyQuery(url=...)``; a second positional argument becomes ``data``.

    Keyword arguments:

    * ``parser``: parser name forwarded to :func:`fromstring`.
    * ``parent``: the selection this one was derived from.
    * ``css_translator``: translator used instead of the default one.
    * ``namespaces``: namespace mapping passed to XPath evaluation.
    * ``filename``: path of a file to parse.
    * ``url`` (optionally with ``opener``): resource to fetch and parse;
      remaining keyword arguments are handed to the opener.

    Raises ``ValueError`` for unknown keyword arguments or a wrong number
    of positional arguments, and ``TypeError`` for an unsupported context.
    """
    html = None
    elements = []
    self._base_url = None
    self.parser = kwargs.pop('parser', None)

    # A leading http(s) URL is a shortcut for the ``url`` keyword.
    if (len(args) >= 1 and
            isinstance(args[0], str) and
            args[0].split('://', 1)[0] in ('http', 'https')):
        kwargs['url'] = args[0]
        if len(args) >= 2:
            kwargs['data'] = args[1]
        args = []

    self._parent = kwargs.pop('parent', no_default)

    if 'css_translator' in kwargs:
        self._translator = kwargs.pop('css_translator')
    elif self.parser in ('xml',):
        self._translator = self._translator_class(xhtml=True)
    elif self._parent is not no_default:
        self._translator = self._parent._translator
    else:
        self._translator = self._translator_class(xhtml=False)

    self.namespaces = kwargs.pop('namespaces', None)

    if kwargs:
        # specific case to get the dom
        if 'filename' in kwargs:
            html = open(kwargs['filename'])
        elif 'url' in kwargs:
            url = kwargs.pop('url')
            if 'opener' in kwargs:
                opener = kwargs.pop('opener')
                html = opener(url, **kwargs)
            else:
                html = url_opener(url, kwargs)
            if not self.parser:
                self.parser = 'html'
            self._base_url = url
        else:
            raise ValueError('Invalid keyword arguments %s' % kwargs)
        try:
            elements = fromstring(html, self.parser)
        finally:
            # close open descriptor if possible, also when parsing failed,
            # so that the file or response is never leaked
            if hasattr(html, 'close'):
                try:
                    html.close()
                except Exception:
                    # closing is best effort only
                    pass
    else:
        # get nodes
        # determine context and selector if any
        selector = context = no_default
        length = len(args)
        if length == 1:
            context = args[0]
        elif length == 2:
            selector, context = args
        else:
            raise ValueError(
                "You can't do that. Please, provide arguments")

        # get context
        if isinstance(context, basestring):
            elements = fromstring(context, self.parser)
        elif isinstance(context, self.__class__):
            # copy
            elements = context[:]
        elif isinstance(context, list):
            elements = context
        elif isinstance(context, etree._Element):
            elements = [context]
        else:
            raise TypeError(context)

        # select nodes
        if elements and selector is not no_default:
            xpath = self._css_to_xpath(selector)
            results = []
            for tag in elements:
                results.extend(
                    tag.xpath(xpath, namespaces=self.namespaces))
            elements = results

    list.__init__(self, elements)
def _css_to_xpath(self, selector, prefix='descendant-or-self::'):
selector = selector.replace('[@', '[')
return self._translator.css_to_xpath(selector, prefix)
def _copy(self, *args, **kwargs):
kwargs.setdefault('namespaces', self.namespaces)
return self.__class__(*args, **kwargs)
def __call__(self, *args, **kwargs):
    """Return a new PyQuery instance derived from this one.

    A single string argument that does not start with ``<`` is taken as a
    selector and evaluated against ``self``.  An empty string yields an
    empty selection.  ``ValueError`` is raised when no argument is given.
    """
    if not args:
        raise ValueError('You must provide at least a selector')
    first = args[0]
    if first == '':
        return self._copy([])
    is_lone_selector = (len(args) == 1 and
                        isinstance(first, str) and
                        not first.startswith('<'))
    if is_lone_selector:
        # use the current selection as the context of the selector
        args += (self,)
    return self._copy(*args, parent=self, **kwargs)
# Keep the original ``list`` methods reachable under a ``_`` prefix:
# ``append`` and ``extend`` are redefined on this class.
_append = list.append
_extend = list.extend
# improve pythonic api
def __add__(self, other):
assert isinstance(other, self.__class__)
return self._copy(self[:] + other[:])
def extend(self, other):
    """Extend in place with the nodes of another PyQuery object.

    ``other`` must be an instance of the same class.  Returns ``self``.
    """
    assert isinstance(other, self.__class__)
    extra_nodes = other[:]
    self._extend(extra_nodes)
    return self
def items(self, selector=None):
    """Iterate over the elements, yielding one PyQuery object per element.

    With a ``selector`` the iteration runs over ``self(selector)`` instead
    of ``self``::

        >>> d = PyQuery('<div><span>foo</span><span>bar</span></div>')
        >>> [i.text() for i in d.items('span')]
        ['foo', 'bar']
        >>> [i.text() for i in d('span').items()]
        ['foo', 'bar']
        >>> list(d.items('a')) == list(d('a').items())
        True
    """
    elems = (self(selector) or []) if selector else self
    for elem in elems:
        yield self._copy(elem, parent=self)
def xhtml_to_html(self):
    """Remove the xhtml namespace from the tree of the first element.

    Nothing happens on an empty selection.  Returns ``self``::

        >>> doc = PyQuery(
        ...         '<html xmlns="http://www.w3.org/1999/xhtml"></html>')
        >>> doc
        [<{http://www.w3.org/1999/xhtml}html>]
        >>> doc.xhtml_to_html()
        [<html>]
    """
    if not self:
        return self
    tree = self[0].getroottree()
    lxml.html.xhtml_to_html(tree)
    return self
def remove_namespaces(self):
    """Strip the namespace from every tag in the first element's tree.

    Nothing happens on an empty selection.  Returns ``self``::

        >>> doc = PyQuery('<foo xmlns="http://example.com/foo"></foo>')
        >>> doc
        [<{http://example.com/foo}foo>]
        >>> doc.remove_namespaces()
        [<foo>]
    """
    if self:
        tree = self[0].getroottree()
        for node in tree.iter('{*}*'):
            if node.tag.startswith('{'):
                # "{uri}local" -> "local"
                node.tag = node.tag.split('}', 1)[1]
    return self
def __str__(self):
    """Return the concatenated xml serialization of the current nodes::

        >>> xml = PyQuery(
        ...   '<script><![[CDATA[ ]></script>', parser='html_fragments')
        >>> print(str(xml))
        <script>&lt;![[CDATA[ ]&gt;</script>
    """
    pieces = (etree.tostring(node, encoding=str) for node in self)
    return ''.join(pieces)
def __unicode__(self):
    """Return the concatenated xml serialization of the current nodes."""
    pieces = (etree.tostring(node, encoding=str) for node in self)
    return u''.join(pieces)
def __html__(self):
    """Return the concatenated html serialization of the current nodes::

        >>> html = PyQuery(
        ...   '<script><![[CDATA[ ]></script>', parser='html_fragments')
        >>> print(html.__html__())
        <script><![[CDATA[ ]></script>
    """
    pieces = (lxml.html.tostring(node, encoding=str) for node in self)
    return u''.join(pieces)
def __repr__(self):
r = []
try:
for el in self:
c = el.get('class')
c = c and '.' + '.'.join(c.split(' ')) or ''
id = el.get('id')
id = id and '#' + id or ''
r.append('<%s%s%s>' % (el.tag, id, c))
return '[' + (', '.join(r)) + ']'
except AttributeError:
return list.__repr__(self)
@property
def root(self):
    """Return the root tree of the document.

    The tree is taken from the first element of the parent selection when
    there is one, and from the first element of ``self`` otherwise.
    """
    source = self if self._parent is no_default else self._parent
    return source[0].getroottree()
@property
def encoding(self):
    """Return the xml encoding recorded for the root tree.

    ``None`` is returned when there is no root.
    """
    root = self.root
    if root is None:
        return None
    return root.docinfo.encoding
##############
# Traversing #
##############
def _filter_only(self, selector, elements, reverse=False, unique=False):
    """Filters the selection set only, as opposed to also including
    descendants.

    ``elements`` are kept when they match ``selector`` themselves (all of
    them when ``selector`` is ``None``).  ``reverse`` reverses the result
    in place and ``unique`` drops repeated items, keeping the first
    occurrence.  The result is a new selection whose parent is ``self``.
    """
    if selector is None:
        results = elements
    else:
        xpath = self._css_to_xpath(selector, 'self::')
        results = [
            match
            for tag in elements
            for match in tag.xpath(xpath, namespaces=self.namespaces)
        ]
    if reverse:
        results.reverse()
    if unique:
        deduped = []
        for item in results:
            if item not in deduped:
                deduped.append(item)
        results = deduped
    return self._copy(results, parent=self)
def parent(self, selector=None):
    """Return the distinct direct parents of the current elements,
    optionally restricted to those matching ``selector``."""
    parents = []
    for node in self:
        up = node.getparent()
        if up is not None:
            parents.append(up)
    return self._filter_only(selector, parents, unique=True)
def prev(self, selector=None):
    """Return the immediately preceding sibling of each element,
    optionally restricted to those matching ``selector``."""
    siblings = []
    for node in self:
        before = node.getprevious()
        if before is not None:
            siblings.append(before)
    return self._filter_only(selector, siblings)
def next(self, selector=None):
    """Return the immediately following sibling of each element,
    optionally restricted to those matching ``selector``."""
    siblings = []
    for node in self:
        after = node.getnext()
        if after is not None:
            siblings.append(after)
    return self._filter_only(selector, siblings)
def _traverse(self, method):
for e in self:
current = getattr(e, method)()
while current is not None:
yield current
current = getattr(current, method)()
def _traverse_parent_topdown(self):
for e in self:
this_list = []
current = e.getparent()
while current is not None:
this_list.append(current)
current = current.getparent()
this_list.reverse()
for j in this_list:
yield j
def _next_all(self):
    """Return the list of all following siblings of the elements."""
    return list(self._traverse('getnext'))
@with_camel_case_alias
def next_all(self, selector=None):
    """Return all following siblings, optionally filtered by ``selector``::

        >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
        >>> d = PyQuery(h)
        >>> d('p:last').next_all()
        [<img>]
        >>> d('p:last').nextAll()
        [<img>]
    """
    following = self._next_all()
    return self._filter_only(selector, following)
def _prev_all(self):
    """Return the list of all preceding siblings, nearest sibling first."""
    return list(self._traverse('getprevious'))
@with_camel_case_alias
def prev_all(self, selector=None):
    """Return all preceding siblings, optionally filtered by ``selector``::

        >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
        >>> d = PyQuery(h)
        >>> d('p:last').prev_all()
        [<p.hello>]
        >>> d('p:last').prevAll()
        [<p.hello>]
    """
    preceding = self._prev_all()
    return self._filter_only(selector, preceding, reverse=True)
def siblings(self, selector=None):
    """Return preceding then following siblings, optionally filtered::

        >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
        >>> d = PyQuery(h)
        >>> d('.hello').siblings()
        [<p>, <img>]
        >>> d('.hello').siblings('img')
        [<img>]
    """
    around = self._prev_all() + self._next_all()
    return self._filter_only(selector, around)
def parents(self, selector=None):
    """Return the distinct ancestors, optionally filtered by ``selector``::

        >>> d = PyQuery('<span><p class="hello">Hi</p><p>Bye</p></span>')
        >>> d('p').parents()
        [<span>]
        >>> d('.hello').parents('span')
        [<span>]
        >>> d('.hello').parents('p')
        []
    """
    ancestors = list(self._traverse_parent_topdown())
    return self._filter_only(selector, ancestors, unique=True)
def children(self, selector=None):
    """Return the direct children of the elements, optionally filtered by
    ``selector``::

        >>> d = PyQuery('<span><p class="hello">Hi</p><p>Bye</p></span>')
        >>> d
        [<span>]
        >>> d.children()
        [<p.hello>, <p>]
        >>> d.children('.hello')
        [<p.hello>]
    """
    kids = []
    for tag in self:
        kids.extend(tag.getchildren())
    return self._filter_only(selector, kids)
def closest(self, selector=None):
    """For each element return the first node matching ``selector`` among
    the element itself and its ancestors, walking upwards::

        >>> d = PyQuery(
        ...  '<div class="hello"><p>This is a '
        ...  '<strong class="hello">test</strong></p></div>')
        >>> d('strong').closest('div')
        [<div.hello>]
        >>> d('strong').closest('.hello')
        [<strong.hello>]
        >>> d('strong').closest('form')
        []
    """
    found = []
    for node in self:
        while node is not None:
            if self._copy(node).is_(selector):
                found.append(node)
                break
            node = node.getparent()
    return self._copy(found, parent=self)
def contents(self):
    """Return the child nodes of the elements, text nodes included::

        >>> d = PyQuery('hello <b>bold</b>')
        >>> d.contents()  # doctest: +ELLIPSIS
        ['hello ', <Element b at ...>]
    """
    nodes = []
    for elem in self:
        found = elem.xpath('child::text()|child::*',
                           namespaces=self.namespaces)
        nodes.extend(found)
    return self._copy(nodes, parent=self)
def filter(self, selector):
    """Filter elements in self using selector (string or function).

    A callable is invoked with the index and the element (trimmed to the
    number of arguments it declares) and the element is kept when the
    result is truthy.  While it runs, the current element is also exposed
    as the global name ``this`` of the callable::

        >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p>')
        >>> d('p')
        [<p.hello>, <p>]
        >>> d('p').filter('.hello')
        [<p.hello>]
        >>> d('p').filter(lambda i: i == 1)
        [<p>]
        >>> d('p').filter(lambda i: PyQuery(this).text() == 'Hi')
        [<p.hello>]
        >>> d('p').filter(lambda i, this: PyQuery(this).text() == 'Hi')
        [<p.hello>]
    """
    if not hasattr(selector, '__call__'):
        return self._filter_only(selector, self)

    kept = []
    # NOTE(review): this inspects ``callback`` rather than ``selector``;
    # ``callback`` declares one named parameter, so the check below always
    # holds and ``this`` is injected for every callable -- confirm intent.
    named_params = getargspec(callback)
    try:
        for index, this in enumerate(self):
            if len(named_params) == 1:
                selector.__globals__['this'] = this
            if callback(selector, index, this):
                kept.append(this)
    finally:
        # do not leave ``this`` behind in the callable's module globals
        selector.__globals__.pop('this', None)
    return self._copy(kept, parent=self)
def not_(self, selector):
    """Return elements that don't match the given selector::

        >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p><div></div>')
        >>> d('p').not_('.hello')
        [<p>]
    """
    excluded = set(self._copy(selector, self))
    remaining = [e for e in self if e not in excluded]
    return self._copy(remaining, parent=self)
def is_(self, selector):
    """Return True if selector matches at least one current element, else
    False::

        >>> d = PyQuery('<p class="hello"><span>Hi</span></p><p>Bye</p>')
        >>> d('p').eq(0).is_('.hello')
        True
        >>> d('p').eq(0).is_('span')
        False
        >>> d('p').eq(1).is_('.hello')
        False

    ..
    """
    matches = self._filter_only(selector, self)
    return bool(matches)
def find(self, selector):
    """Find elements using selector traversing down from self::

        >>> m = '<p><span><em>Whoah!</em></span></p><p><em> there</em></p>'
        >>> d = PyQuery(m)
        >>> d('p').find('em')
        [<em>, <em>]
        >>> d('p').eq(1).find('em')
        [<em>]
    """
    xpath = self._css_to_xpath(selector)
    elements = []
    # The search starts at the children, so the elements of ``self``
    # themselves are never part of the result.
    for tag in self:
        for child in tag.getchildren():
            elements.extend(child.xpath(xpath, namespaces=self.namespaces))
    return self._copy(elements, parent=self)
def eq(self, index):
    """Return PyQuery of only the element with the provided index.

    An out-of-range index gives an empty selection::

        >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p><div></div>')
        >>> d('p').eq(0)
        [<p.hello>]
        >>> d('p').eq(1)
        [<p>]
        >>> d('p').eq(2)
        []

    ..
    """
    # Plain indexing is used (slicing would give an empty list for
    # index=-1), so out-of-bound access is handled here.
    try:
        picked = self[index]
    except IndexError:
        picked = []
    return self._copy(picked, parent=self)
def each(self, func):
    """Apply ``func`` on each node.

    ``func`` receives the index and the element (trimmed to the number of
    arguments it declares); the element is also exposed as the global name
    ``this`` of ``func`` while it runs.  Returning ``False`` stops the
    iteration.  Returns ``self``.
    """
    try:
        for index, element in enumerate(self):
            func.__globals__['this'] = element
            outcome = callback(func, index, element)
            if outcome is False:
                break
    finally:
        # do not leave ``this`` behind in the function's module globals
        func.__globals__.pop('this', None)
    return self
def map(self, func):
    """Return a new PyQuery after transforming current items with func.

    func should take two arguments - 'index' and 'element'.  Elements can
    also be referred to as 'this' inside of func.  ``None`` results are
    skipped and list results are flattened into the output::

        >>> d = PyQuery('<p class="hello">Hi there</p><p>Bye</p><br />')
        >>> d('p').map(lambda i, e: PyQuery(e).text())
        ['Hi there', 'Bye']

        >>> d('p').map(lambda i, e: len(PyQuery(this).text()))
        [8, 3]

        >>> d('p').map(lambda i, e: PyQuery(this).text().split())
        ['Hi', 'there', 'Bye']
    """
    collected = []
    try:
        for index, element in enumerate(self):
            func.__globals__['this'] = element
            result = callback(func, index, element)
            if result is None:
                continue
            if isinstance(result, list):
                collected.extend(result)
            else:
                collected.append(result)
    finally:
        # do not leave ``this`` behind in the function's module globals
        func.__globals__.pop('this', None)
    return self._copy(collected, parent=self)
@property
def length(self):
    """Number of items in the selection (same as ``len(self)``)."""
    count = len(self)
    return count
def size(self):
    """Return the number of items in the selection."""
    count = len(self)
    return count
def end(self):
    """Break out of a level of traversal and return to the parent level::

        >>> m = '<p><span><em>Whoah!</em></span></p><p><em> there</em></p>'
        >>> d = PyQuery(m)
        >>> d('p').eq(1).find('em').end().end()
        [<p>, <p>]
    """
    previous_level = self._parent
    return previous_level
##############
# Attributes #
##############
def attr(self, *args, **kwargs):
    """Attributes manipulation.

    * ``attr(name)`` returns the attribute of the first element.
    * ``attr(name, value)`` sets it on every element; a ``None`` value
      removes the attribute instead.
    * ``attr(key=value, ...)`` sets several attributes at once.

    ``class_`` and ``for_`` are accepted as spellings of ``class`` and
    ``for``.  ``None`` is returned on an empty selection, ``self`` after a
    set, and ``ValueError`` is raised when no usable argument is given.
    """
    mapping = {'class_': 'class', 'for_': 'for'}
    attr = value = no_default
    count = len(args)
    if count == 1:
        attr = mapping.get(args[0], args[0])
    elif count == 2:
        attr, value = args
        attr = mapping.get(attr, attr)
    elif kwargs:
        attr = {mapping.get(k, k): v for k, v in kwargs.items()}
    else:
        raise ValueError('Invalid arguments %s %s' % (args, kwargs))

    if not self:
        return None
    if isinstance(attr, dict):
        for tag in self:
            for key, value in attr.items():
                tag.set(key, value)
        return self
    if value is no_default:
        return self[0].get(attr)
    if value is None:
        return self.remove_attr(attr)
    for tag in self:
        tag.set(attr, value)
    return self
@with_camel_case_alias
def remove_attr(self, name):
    """Remove an attribute from every element; missing ones are ignored::

        >>> d = PyQuery('<div id="myid"></div>')
        >>> d.remove_attr('id')
        [<div>]
        >>> d.removeAttr('id')
        [<div>]

    ..
    """
    for tag in self:
        attributes = tag.attrib
        if name in attributes:
            del attributes[name]
    return self
# Re-bind ``attr`` as a FlexibleElement so that it can also be used through
# attribute/item access; deletion is routed to ``remove_attr``.
attr = FlexibleElement(pget=attr, pdel=remove_attr)
#######
# CSS #
#######
def height(self, value=no_default):
    """Set/get the ``height`` attribute of the element.

    The call is forwarded to ``attr('height', value)``.
    """
    result = self.attr('height', value)
    return result
def width(self, value=no_default):
    """Set/get the ``width`` attribute of the element.

    The call is forwarded to ``attr('width', value)``.
    """
    result = self.attr('width', value)
    return result
@with_camel_case_alias
def has_class(self, name):
    """Return True if at least one element has the class ``name``::

        >>> d = PyQuery('<div class="myclass"></div>')
        >>> d.has_class('myclass')
        True
        >>> d.hasClass('myclass')
        True

    ..
    """
    class_selector = '.%s' % name
    return self.is_(class_selector)
[docs] @with_camel_case_alias
def add_class(self, value):
"""Add a css class to elements::
>>> d = PyQuery('<div></div>')
>>> d.add_class('myclass')
[<div.myclass>]
>>> d.addClass('myclass')
[<div.myclass>]
..
"""
for tag in self:
values = value.split(' ')
classes = (tag.get('class') or '').split()
classes += [v for v in values if v not in classes]
tag.set('class', ' '.join(classes))
return self
[docs] @with_camel_case_alias
def remove_class(self, value):
"""Remove a css class to elements::
>>> d = PyQuery('<div class="myclass"></div>')
>>> d.remove_class('myclass')
[<div>]
>>> d.removeClass('myclass')
[<div>]
..
"""
for tag in self:
values = value.split(' ')
classes = set((tag.get('class') or '').split())
classes.difference_update(values)
classes.difference_update([''])
classes = ' '.join(classes)
if classes.strip():
tag.set('class', classes)
elif tag.get('class'):
tag.set('class', classes)
return self
[docs] @with_camel_case_alias
def toggle_class(self, value):
"""Toggle a css class to elements
>>> d = PyQuery('<div></div>')
>>> d.toggle_class('myclass')
[<div.myclass>]
>>> d.toggleClass('myclass')
[<div>]
"""
for tag in self:
values = value.split(' ')
classes = (tag.get('class') or '').split()
values_to_add = [v for v in values if v not in classes]
values_to_del = [v for v in values if v in classes]
classes = [v for v in classes if v not in values_to_del]
classes += values_to_add
tag.set('class', ' '.join(classes))
return self
def css(self, *args, **kwargs):
    """css attributes manipulation.

    * ``css(name, value)`` sets one declaration in the ``style`` attribute
      of every element (only when ``value`` is a string).
    * ``css({name: value, ...})`` or ``css(name=value, ...)`` sets several.

    Underscores in property names are turned into dashes and an existing
    declaration of the same property is replaced.  Always returns
    ``self``; ``ValueError`` is raised when no argument is given.
    """
    attr = value = no_default
    count = len(args)
    if count == 1:
        attr = args[0]
    elif count == 2:
        attr, value = args
    elif kwargs:
        attr = kwargs
    else:
        raise ValueError('Invalid arguments %s %s' % (args, kwargs))

    def kept_declarations(tag, is_overridden):
        # existing declarations of ``tag`` whose property is not replaced
        kept = []
        for declaration in (tag.get('style') or '').split(';'):
            text = declaration.strip()
            if text and not is_overridden(
                    declaration.split(':')[0].strip()):
                kept.append(text)
        return kept

    if isinstance(attr, dict):
        for tag in self:
            overridden = [key.strip().replace('_', '-')
                          for key in attr.keys()]
            current = kept_declarations(
                tag, lambda prop: prop in overridden)
            for key, value in attr.items():
                current.append('%s: %s' % (key.replace('_', '-'), value))
            tag.set('style', '; '.join(current))
    elif isinstance(value, basestring):
        attr = attr.replace('_', '-')
        for tag in self:
            current = kept_declarations(
                tag, lambda prop: prop == attr.strip())
            current.append('%s: %s' % (attr, value))
            tag.set('style', '; '.join(current))
    return self


# Also allow ``d.css = {...}`` and attribute/item style access.
css = FlexibleElement(pget=css, pset=css)
###################
# CORE UI EFFECTS #
###################
def hide(self):
    """Add display:none to elements style::

        >>> print(PyQuery('<div style="display:none;"/>').hide())
        <div style="display: none"/>
    """
    hidden = self.css('display', 'none')
    return hidden
def show(self):
    """Add display:block to elements style::

        >>> print(PyQuery('<div />').show())
        <div style="display: block"/>
    """
    shown = self.css('display', 'block')
    return shown
########
# HTML #
########
def val(self, value=no_default):
    """Get the value of the first element, or set it on every element.

    Set the attribute value::

        >>> d = PyQuery('<input />')
        >>> d.val('Youhou')
        [<input>]

    Get the attribute value::

        >>> d.val()
        'Youhou'

    Set the selected values for a `select` element with the `multiple`
    attribute::

        >>> d = PyQuery('''
        ...     <select multiple>
        ...         <option value="you"><option value="hou">
        ...     </select>
        ... ''')
        >>> d.val(['you', 'hou'])
        [<select>]

    Get the selected values for a `select` element with the `multiple`
    attribute::

        >>> d.val()
        ['you', 'hou']

    When getting, ``None`` is returned for an empty selection.  When
    setting, ``self`` is returned and a list passed as ``value`` is never
    modified.
    """
    def _get_value(tag):
        # <textarea>
        if tag.tag == 'textarea':
            return self._copy(tag).html()
        # <select>
        elif tag.tag == 'select':
            if 'multiple' in tag.attrib:
                # Only extract value if selected
                selected = self._copy(tag)('option[selected]')
                # Rebuild list to avoid serialization error
                return list(selected.map(
                    lambda _, o: self._copy(o).attr('value')
                ))
            selected_option = self._copy(tag)('option[selected]:last')
            if selected_option:
                return selected_option.attr('value')
            else:
                return self._copy(tag)('option').attr('value')
        # <input type="checkbox"> or <input type="radio">
        # The test is made on ``tag`` alone: testing the whole selection
        # would also match when only a later element is a checkbox/radio.
        elif self._copy(tag).is_(':checkbox,:radio'):
            val = self._copy(tag).attr('value')
            if val is None:
                return 'on'
            else:
                return val
        # <input>
        elif tag.tag == 'input':
            val = self._copy(tag).attr('value')
            return val.replace('\n', '') if val else ''
        # everything else.
        return self._copy(tag).attr('value') or ''

    def _set_value(pq, value):
        for tag in pq:
            # <select>
            if tag.tag == 'select':
                # Private per-element copy: it is emptied below, which
                # must affect neither the caller's list nor later tags.
                wanted = list(value) if isinstance(value, list) else [value]

                def _make_option_selected(_, elem):
                    option = self._copy(elem)
                    if option.attr('value') in wanted:
                        option.attr('selected', 'selected')
                        if 'multiple' not in tag.attrib:
                            del wanted[:]  # Ensure it toggles first match
                    else:
                        option.remove_attr('selected')

                self._copy(tag)('option').each(_make_option_selected)
                continue
            # Stringify array
            if isinstance(value, list):
                text = ','.join(value)
            else:
                text = value
            # <textarea>
            if tag.tag == 'textarea':
                self._copy(tag).text(text)
                continue
            # <input> and everything else.
            self._copy(tag).attr('value', text)

    if value is no_default:
        if len(self):
            return _get_value(self[0])
        return None
    _set_value(self, value)
    return self
def html(self, value=no_default, **kwargs):
    """Get or set the html representation of sub nodes.

    Get the text value (``None`` on an empty selection)::

        >>> d = PyQuery('<div><span>toto</span></div>')
        >>> print(d.html())
        <span>toto</span>

    Extra args are passed to ``lxml.etree.tostring``::

        >>> d = PyQuery('<div><span></span></div>')
        >>> print(d.html())
        <span/>
        >>> print(d.html(method='html'))
        <span></span>

    Set the text value (a string, an instance of this class or a falsy
    value meaning "empty"; anything else raises ``ValueError``)::

        >>> d.html('<span>Youhou !</span>')
        [<div>]
        >>> print(d)
        <div><span>Youhou !</span></div>
    """
    if value is no_default:
        if not self:
            return None
        tag = self[0]
        leading_text = tag.text or ''
        children = tag.getchildren()
        if not children:
            return leading_text
        kwargs.setdefault('encoding', str)
        serialized = [etree.tostring(child, **kwargs) for child in children]
        return leading_text + u''.join(serialized)

    if isinstance(value, self.__class__):
        new_html = str(value)
    elif isinstance(value, basestring):
        new_html = value
    elif not value:
        new_html = ''
    else:
        raise ValueError(type(value))

    for tag in self:
        for child in tag.getchildren():
            tag.remove(child)
        # the markup is parsed once per element
        wrapper = fromstring(
            u'<root>' + new_html + u'</root>',
            self.parser)[0]
        new_children = wrapper.getchildren()
        if new_children:
            tag.extend(new_children)
        tag.text = wrapper.text
    return self
@with_camel_case_alias
def outer_html(self, method="html"):
    """Get the html representation of the first selected element, without
    its tail text (``None`` on an empty selection)::

        >>> d = PyQuery('<div><span class="red">toto</span> rocks</div>')
        >>> print(d('span'))
        <span class="red">toto</span> rocks
        >>> print(d('span').outer_html())
        <span class="red">toto</span>
        >>> print(d('span').outerHtml())
        <span class="red">toto</span>

        >>> S = PyQuery('<p>Only <b>me</b> & myself</p>')
        >>> print(S('b').outer_html())
        <b>me</b>

    ..
    """
    if not self:
        return None
    first = self[0]
    if first.tail:
        # serialize a tail-less copy so the document itself is unchanged
        first = deepcopy(first)
        first.tail = ''
    return etree.tostring(first, encoding=str, method=method)
def text(self, value=no_default, **kwargs):
    """Get or set the text representation of sub nodes.

    Get the text value (extra keyword arguments are passed to
    ``extract_text``; ``''`` is returned on an empty selection)::

        >>> doc = PyQuery('<div><span>toto</span><span>tata</span></div>')
        >>> print(doc.text())
        tototata
        >>> doc = PyQuery('''<div><span>toto</span>
        ...               <span>tata</span></div>''')
        >>> print(doc.text())
        toto tata

    Get the text value, without squashing newlines::

        >>> doc = PyQuery('''<div><span>toto</span>
        ...               <span>tata</span></div>''')
        >>> print(doc.text(squash_space=False))
        toto
        tata

    Set the text value::

        >>> doc.text('Youhou !')
        [<div>]
        >>> print(doc)
        <div>Youhou !</div>
    """
    if value is no_default:
        if not self:
            return ''
        parts = []
        for tag in self:
            if tag.tag == 'textarea':
                parts.append(self._copy(tag).html())
            else:
                parts.append(extract_text(tag, **kwargs))
        return ' '.join(parts)

    for tag in self:
        for child in tag.getchildren():
            tag.remove(child)
        tag.text = value
    return self
################
# Manipulating #
################
def _get_root(self, value):
    """Turn ``value`` into ``(root, root_text)`` for the insertion methods.

    A string is parsed inside a temporary ``<root>`` element, an lxml
    element is wrapped in a new selection and a ``PyQuery`` object is used
    as is; anything else raises ``TypeError``.  ``root_text`` is
    ``root.text`` when that is a string and ``''`` otherwise.
    """
    if isinstance(value, basestring):
        root = fromstring(u'<root>' + value + u'</root>',
                          self.parser)[0]
    elif isinstance(value, etree._Element):
        root = self._copy(value)
    elif isinstance(value, PyQuery):
        root = value
    else:
        raise TypeError(
            'Value must be string, PyQuery or Element. Got %r' % value)

    root_text = getattr(root, 'text', None)
    if not isinstance(root_text, basestring):
        root_text = ''
    return root, root_text
def append(self, value):
    """Append ``value`` to each node.

    ``value`` may be a string of markup, an lxml element or a PyQuery
    object (see ``_get_root``).  Its leading text goes after the last
    child of the node, or at the end of the node's text when it has no
    child.  Returns ``self``.
    """
    root, root_text = self._get_root(value)
    for position, tag in enumerate(self):
        if len(tag) > 0:  # if the tag has children
            last_child = tag[-1]
            last_child.tail = (last_child.tail or '') + root_text
        else:
            tag.text = (tag.text or '') + root_text
        if position > 0:
            # every node after the first one gets its own copies
            root = deepcopy(list(root))
        tag.extend(root)
    return self
@with_camel_case_alias
def append_to(self, value):
    """Append the current nodes to ``value`` (calls ``value.append(self)``).

    Returns ``self``.
    """
    target = value
    target.append(self)
    return self
def prepend(self, value):
    """Prepend ``value`` to each node.

    ``value`` may be a string of markup, an lxml element or a PyQuery
    object (see ``_get_root``).  The node's existing text becomes the tail
    of the last inserted item.  Returns ``self``.
    """
    root, root_text = self._get_root(value)
    for position, tag in enumerate(self):
        existing_text = tag.text or ''
        if len(root) > 0:
            root[-1].tail = existing_text
            tag.text = root_text
        else:
            tag.text = root_text + existing_text
        if position > 0:
            root = deepcopy(list(root))
        tag[:0] = root
        # keep hold of the inserted nodes for the next iteration
        root = tag[:len(root)]
    return self
@with_camel_case_alias
def prepend_to(self, value):
    """Prepend the current nodes to ``value`` (calls
    ``value.prepend(self)``).

    Returns ``self``.
    """
    target = value
    target.prepend(self)
    return self
def after(self, value):
    """Add ``value`` after each node.

    ``value`` may be a string of markup, an lxml element or a PyQuery
    object (see ``_get_root``).  Its leading text is added to the tail of
    the node.  Returns ``self``.
    """
    root, root_text = self._get_root(value)
    for position, tag in enumerate(self):
        tag.tail = (tag.tail or '') + root_text
        if position > 0:
            root = deepcopy(list(root))
        parent = tag.getparent()
        index = parent.index(tag) + 1
        parent[index:index] = root
        # NOTE(review): the slice end is ``len(root)``, not
        # ``index + len(root)`` -- kept as is, confirm intent.
        root = parent[index:len(root)]
    return self
@with_camel_case_alias
def insert_after(self, value):
    """Insert the current nodes after ``value`` (calls
    ``value.after(self)``).

    Returns ``self``.
    """
    target = value
    target.after(self)
    return self
def before(self, value):
    """Insert ``value`` before each node.

    ``value`` may be a string of markup, an lxml element or a PyQuery
    object (see ``_get_root``).  Its leading text is added to the tail of
    the previous sibling, or to the parent's text when the node is the
    first child.  Returns ``self``.
    """
    root, root_text = self._get_root(value)
    for position, tag in enumerate(self):
        previous = tag.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + root_text
        else:
            parent = tag.getparent()
            parent.text = (parent.text or '') + root_text
        if position > 0:
            root = deepcopy(list(root))
        parent = tag.getparent()
        index = parent.index(tag)
        parent[index:index] = root
        # NOTE(review): the slice end is ``len(root)``, not
        # ``index + len(root)`` -- kept as is, confirm intent.
        root = parent[index:len(root)]
    return self
@with_camel_case_alias
def insert_before(self, value):
    """Insert the current nodes before ``value`` (calls
    ``value.before(self)``).

    Returns ``self``.
    """
    target = value
    target.before(self)
    return self
def wrap(self, value):
    """A string of HTML that will be created on the fly and wrapped around
    each target.

    A copy of each target is placed in the wrapper itself or, when the
    wrapper has children, in its last child.  The selection then holds the
    wrappers::

        >>> d = PyQuery('<span>youhou</span>')
        >>> d.wrap('<div></div>')
        [<div>]
        >>> print(d)
        <div><span>youhou</span></div>
    """
    assert isinstance(value, basestring)
    template = fromstring(value)[0]
    wrappers = []
    for tag in self:
        wrapper = deepcopy(template)
        wrapper_children = wrapper.getchildren()
        holder = wrapper_children[-1] if wrapper_children else wrapper
        holder.append(deepcopy(tag))
        wrappers.append(wrapper)

        parent = tag.getparent()
        if parent is not None:
            # swap the original node for its wrapper in the document
            for sibling in parent.iterchildren():
                if sibling is tag:
                    sibling.addnext(wrapper)
                    parent.remove(sibling)
                    break
    self[:] = wrappers
    return self
@with_camel_case_alias
def wrap_all(self, value):
    """Wrap all the elements in the matched set into a single wrapper
    element.

    Copies of the elements are placed in the wrapper itself or, when the
    wrapper has children, in its last child.  The originals are replaced
    in the document only when they share one parent and are all of its
    element children.  The selection then holds the wrapper::

        >>> d = PyQuery('<div><span>Hey</span><span>you !</span></div>')
        >>> print(d('span').wrap_all('<div id="wrapper"></div>'))
        <div id="wrapper"><span>Hey</span><span>you !</span></div>

        >>> d = PyQuery('<div><span>Hey</span><span>you !</span></div>')
        >>> print(d('span').wrapAll('<div id="wrapper"></div>'))
        <div id="wrapper"><span>Hey</span><span>you !</span></div>

    ..
    """
    if not self:
        return self

    assert isinstance(value, basestring)
    template = fromstring(value)[0]
    wrapper = deepcopy(template)
    wrapper_children = wrapper.getchildren()
    holder = wrapper_children[-1] if wrapper_children else wrapper

    same_parent = True
    parent = self[0].getparent()
    if parent is None:
        parent = no_default

    # add nodes to wrapper and check parent
    for tag in self:
        holder.append(deepcopy(tag))
        if tag.getparent() is not parent:
            same_parent = False

    # replace nodes in parent if possible
    if parent is not no_default and same_parent:
        siblings = list(parent.iterchildren())
        if len(siblings) == len(self):
            for tag in self:
                parent.remove(tag)
            parent.append(wrapper)

    self[:] = [wrapper]
    return self
@with_camel_case_alias
def replace_with(self, value):
    """Replace nodes by value.

    ``value`` may be a string, a PyQuery object (serialized with
    ``str()``) or a callable receiving the index and the element and
    returning the replacement string.  The tail text of each replaced
    node is kept.  Returns ``self``::

        >>> doc = PyQuery("<html><div /></html>")
        >>> node = PyQuery("<span />")
        >>> child = doc.find('div')
        >>> child.replace_with(node)
        [<div>]
        >>> print(doc)
        <html><span/></html>
    """
    if isinstance(value, PyQuery):
        value = str(value)
    is_factory = hasattr(value, '__call__')
    for index, element in enumerate(self):
        replacement = value(index, element) if is_factory else value
        self._copy(element).before(replacement + (element.tail or ''))
        element.getparent().remove(element)
    return self
@with_camel_case_alias
def replace_all(self, expr):
    """Replace the nodes matching ``expr`` in the parent selection by the
    current nodes.

    Raises ``ValueError`` when this object has no parent.  Returns
    ``self``.
    """
    if self._parent is no_default:
        raise ValueError(
            'replaceAll can only be used with an object with parent')
    targets = self._parent(expr)
    targets.replace_with(self)
    return self
def clone(self):
    """Return a new selection made of deep copies of the nodes.

    The copy is built with ``_copy`` so that, like the results of the
    other methods, it has the class of ``self`` and inherits its
    ``namespaces``.
    """
    duplicates = [deepcopy(tag) for tag in self]
    return self._copy(duplicates)
def empty(self):
    """Remove the content (text and children) of every node.

    Returns ``self``.
    """
    for node in self:
        node.text = None
        node[:] = []
    return self
def remove(self, expr=no_default):
    """Remove nodes.

    Without ``expr`` the current nodes are detached from their parents;
    their tail text is kept, preceded by a space, on the previous sibling
    or on the parent.  With ``expr`` the nodes matching it inside the
    current selection are removed instead.  Returns ``self``::

        >>> h = (
        ... '<div>Maybe <em>she</em> does <strong>NOT</strong> know</div>'
        ... )
        >>> d = PyQuery(h)
        >>> d('strong').remove()
        [<strong>]
        >>> print(d)
        <div>Maybe <em>she</em> does   know</div>
    """
    if expr is not no_default:
        self._copy(expr, self).remove()
        return self

    for tag in self:
        parent = tag.getparent()
        if parent is None:
            continue
        if tag.tail:
            prev = tag.getprevious()
            if prev is None:
                parent.text = (parent.text or '') + ' ' + tag.tail
            else:
                prev.tail = (prev.tail or '') + ' ' + tag.tail
        parent.remove(tag)
    return self
class Fn(object):
    """Hook for defining custom function (like the jQuery.fn).

    Assigning a function to an attribute of ``PyQuery.fn`` installs a
    method of that name on ``PyQuery``.  When the method is called, the
    selection is stored as the global name ``this`` of the function, which
    is then called with the remaining arguments:

    .. sourcecode:: python

     >>> fn = lambda: this.map(lambda i, el: PyQuery(this).outerHtml())
     >>> PyQuery.fn.listOuterHtml = fn
     >>> S = PyQuery(
     ...   '<ol>   <li>Coffee</li>   <li>Tea</li>   <li>Milk</li>   </ol>')
     >>> S('li').listOuterHtml()
     ['<li>Coffee</li>', '<li>Tea</li>', '<li>Milk</li>']

    """

    def __setattr__(self, name, func):
        def fn(self, *args, **kwargs):
            # ``self`` here is the PyQuery object the method is called on
            func.__globals__['this'] = self
            return func(*args, **kwargs)

        fn.__name__ = name
        setattr(PyQuery, name, fn)


# Registration point for custom methods.
fn = Fn()
########
# AJAX #
########
[docs] @with_camel_case_alias
def serialize_array(self):
    """Serialize form elements as a list of ``{'name': ..., 'value': ...}``
    dictionaries, the structure produced by the jQuery API. Notably, it
    does not handle the deprecated `keygen` form element.

        >>> d = PyQuery('<form><input name="order" value="spam"></form>')
        >>> d.serialize_array() == [{'name': 'order', 'value': 'spam'}]
        True
        >>> d.serializeArray() == [{'name': 'order', 'value': 'spam'}]
        True
    """
    return [
        {'name': pair[0], 'value': pair[1]}
        for pair in self.serialize_pairs()
    ]
def serialize(self):
    """Serialize form elements as a URL-encoded string.

        >>> h = (
        ... '<form><input name="order" value="spam">'
        ... '<input name="order2" value="baked beans"></form>'
        ... )
        >>> d = PyQuery(h)
        >>> d.serialize()
        'order=spam&order2=baked%20beans'
    """
    encoded = urlencode(self.serialize_pairs())
    # urlencode writes spaces as '+'; the result uses '%20' instead.
    return encoded.replace('+', '%20')
#####################################################
# Additional methods that are not in the jQuery API #
#####################################################
[docs] @with_camel_case_alias
def serialize_pairs(self):
    """Serialize form elements as an array of 2-tuples conventional for
    typical URL-parsing operations in Python.

    Each selected ``form`` contributes its ``:input`` descendants (and, when
    the form has an ``id``, the controls elsewhere in the document whose
    ``form`` attribute names it); each ``fieldset`` contributes its
    ``:input`` descendants; any other selected element is taken as a
    control itself.  Only named, enabled, non-submitter controls are kept,
    and unchecked checkboxes and radio buttons are dropped.

    Line breaks inside values are emitted as CRLF, and a control without a
    value is serialized as an empty string.

        >>> d = PyQuery('<form><input name="order" value="spam"></form>')
        >>> d.serialize_pairs()
        [('order', 'spam')]
        >>> d.serializePairs()
        [('order', 'spam')]
    """
    # https://github.com/jquery/jquery/blob
    # /2d4f53416e5f74fa98e0c1d66b6f3c285a12f0ce/src/serialize.js#L14
    _submitter_types = ['submit', 'button', 'image', 'reset', 'file']

    def _crlf(value):
        # A missing value (None) has no ``replace`` method; serialize it as
        # the empty string instead of raising AttributeError.
        if value is None:
            return ''
        # Collapse existing CRLF pairs first so that they are not turned
        # into '\r\r\n' (jQuery normalizes with the pattern /\r?\n/g).
        return value.replace('\r\n', '\n').replace('\n', '\r\n')

    controls = self._copy([])
    # Expand list of form controls
    for el in self.items():
        tag = el[0].tag
        if tag == 'form':
            form_id = el.attr('id')
            if form_id:
                # Include inputs outside of their form owner
                root = self._copy(el.root.getroot())
                controls.extend(root(
                    '#%s :not([form]):input, [form="%s"]:input'
                    % (form_id, form_id)))
            else:
                controls.extend(el(':not([form]):input'))
        elif tag == 'fieldset':
            controls.extend(el(':input'))
        else:
            controls.extend(el)

    # Filter controls
    selector = '[name]:enabled:not(button)'  # Not serializing image button
    selector += ''.join(
        ':not([type="%s"])' % s for s in _submitter_types)
    controls = controls.filter(selector)

    def _filter_out_unchecked(_, el):
        el = controls._copy(el)
        return not el.is_(':checkbox:not(:checked)') and \
            not el.is_(':radio:not(:checked)')
    controls = controls.filter(_filter_out_unchecked)

    # jQuery serializes inputs with the datalist element as an ancestor
    # contrary to WHATWG spec as of August 2018
    #
    # xpath = 'self::*[not(ancestor::datalist)]'
    # results = []
    # for tag in controls:
    #     results.extend(tag.xpath(xpath, namespaces=controls.namespaces))
    # controls = controls._copy(results)

    # Serialize values
    ret = []
    for field in controls:
        name = field.attrib['name']
        val = self._copy(field).val()
        if isinstance(val, list):
            # One pair per value when the control yields several values.
            ret.extend((name, _crlf(v)) for v in val)
        else:
            ret.append((name, _crlf(val)))
    return ret
[docs] @with_camel_case_alias
def serialize_dict(self):
    """Serialize form elements as an ordered dictionary. A name that occurs
    once maps to its value; a name that occurs several times maps to the
    list of its values, in document order.

        >>> d = PyQuery('''<form>
        ...             <input name="order" value="spam">
        ...             <input name="order" value="eggs">
        ...             <input name="order2" value="ham">
        ...             </form>''')
        >>> d.serialize_dict()
        OrderedDict([('order', ['spam', 'eggs']), ('order2', 'ham')])
        >>> d.serializeDict()
        OrderedDict([('order', ['spam', 'eggs']), ('order2', 'ham')])
    """
    collected = OrderedDict()
    for key, value in self.serialize_pairs():
        try:
            existing = collected[key]
        except KeyError:
            # First occurrence: keep the bare value.
            collected[key] = value
            continue
        if isinstance(existing, list):
            existing.append(value)
        else:
            # Second occurrence: promote the stored value to a list.
            collected[key] = [existing, value]
    return collected
@property
def base_url(self):
    """Return the url of current html document or None if not available.

    The object's own url wins; otherwise the lookup continues on the
    parent, when there is one.
    """
    own_url = self._base_url
    if own_url is not None:
        return own_url
    if self._parent is no_default:
        return None
    return self._parent.base_url
def make_links_absolute(self, base_url=None):
    """Make all links absolute.

    Rewrites ``href`` on ``a`` and ``link`` elements, ``src`` on ``script``,
    ``img`` and ``iframe`` elements and ``action`` on ``form`` elements by
    joining each value with the base URL.  Elements lacking the attribute,
    and values starting with ``tel:``, ``callto:`` or ``sms:``, are left
    untouched.

    ``base_url`` defaults to ``self.base_url``; ``ValueError`` is raised
    when neither is available.  Returns ``self``.
    """
    if base_url is None:
        base_url = self.base_url
        if base_url is None:
            # The two literals are concatenated: the trailing space keeps
            # the message from reading "linksabsolute".
            raise ValueError(
                'You need a base URL to make your links '
                'absolute. It can be provided by the base_url parameter.')

    # skip specific "protocol" schemas
    skipped_schemas = ('tel:', 'callto:', 'sms:')

    def repl(attr):
        # Build the per-element callback that rewrites attribute ``attr``.
        def rep(i, e):
            attr_value = self(e).attr(attr)
            # when label hasn't such attr, pass
            if attr_value is None:
                return None
            if attr_value.startswith(skipped_schemas):
                return None
            return self(e).attr(attr,
                                urljoin(base_url, attr_value.strip()))
        return rep

    link_attributes = (
        ('a', 'href'),
        ('link', 'href'),
        ('script', 'src'),
        ('img', 'src'),
        ('iframe', 'src'),
        ('form', 'action'),
    )
    for tag_name, attr_name in link_attributes:
        self(tag_name).each(repl(attr_name))
    return self
# Add a camelCase alias on PyQuery for every name that was registered
# through the @with_camel_case_alias decorator.
build_camel_case_aliases(PyQuery)