lxml 是 libxml2 和 libxslt 库的 Pythonic、成熟的绑定。它使用 ElementTree API 提供对这些库的安全、便捷的访问。
它显著扩展了 ElementTree API,提供对 XPath、RelaxNG、XML Schema、XSLT、C14N 等的支持。参见:项目主页(https://lxml.de/)、GitHub 仓库(https://github.com/lxml/lxml)。
# A future statement must come before every other statement in the module
# (only a docstring, comments and blank lines may precede it); placing it
# after the html_doc assignment raises SyntaxError.
from __future__ import print_function

from lxml import etree

# Sample document used by all XPath examples below.  <body> holds two
# top-level <div>s; the second one (class="book") wraps a class="story" <div>
# containing two <ul> lists: the first with three <a class="sister"> links,
# the second with three <script> items.
html_doc = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<div>
<p>百里守约</p>
</div>
<div class="book">
<div class="story">
<ul>
<li><a href="http://example.com/elsie" class="sister" id="link1">Elsie</a></li>
<li><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a></li>
<li><a href="http://example.com/tillie" class="sister" id="link3">Tillie</a></li>
</ul>
<ul>
<li><script name="name123" id="id123" class="class123" type="type123" style="style123">text123</script></li>
<li><script class="class222" ddd>text222</script></li>
<li><script class="class333" xxx>text333</script></li>
</ul>
</div>
</div>
</body>
</html>
"""

# Parse the HTML string; `tree` is the element that the XPath queries run on.
tree = etree.HTML(html_doc)
tree.xpath 返回的是一个列表。路径开头的 / 表示从根节点开始定位,路径中间的 / 表示一个层级;路径开头的 // 表示从任意位置开始定位,路径中间的 // 表示跨越多个层级。
示例:
# XPath lookups against `tree`.  The comment under each call shows its result;
# the element addresses in those reprs differ from run to run.
tree.xpath('/html/body/div')
# [<Element div at 0x7fb83844ccf8>, <Element div at 0x7fb83844cb48>]
tree.xpath('/html//div')
# [<Element div at 0x7fb83844cb00>, <Element div at 0x7fb83844cc68>, <Element div at 0x7fb83844ccb0>]
tree.xpath('//body/div')
# [<Element div at 0x7fb83844ccf8>, <Element div at 0x7fb83844cb48>]
tree.xpath('//div') # all div elements in the document
# [<Element div at 0x7fb83844cb00>, <Element div at 0x7fb83844cc68>, <Element div at 0x7fb83844ccb0>]
tree.xpath('//div[@class="book"]') # select by attribute value
# [<Element div at 0x7fb83844cb48>]
tree.xpath('//div[@class="book"]/div/ul/li[2]') # positional index, 1-based
# Two results: li[2] matches the second <li> of each of the two <ul> lists.
# [<Element li at 0x7fb8384682d8>, <Element li at 0x7fb838468320>]
tree.xpath('//div[@class="book"]/div/ul/li[2]/a/text()')[0] # /text() selects the element's own text, excluding text inside child tags
# 'Lacie'
tree.xpath('//div[@class="book"]/div/ul/li[2]//text()')[0] # //text() selects text at any depth, including text inside child tags
# 'Lacie'
tree.xpath('//div[@class="book"]/div/ul/li[2]/a/@href') # /@name returns the value of attribute `name`
# ['http://example.com/lacie']
# Collect the <li> items of the first <ul> inside the class="book" div.
li_list = tree.xpath('//div[@class="book"]/div/ul[1]/li')
# [<Element li at 0x7fb838468680>, <Element li at 0x7fb8384682d8>, <Element li at 0x7fb8384686c8>]
for li in li_list:
    # xpath() can be called again on any element it returned; the leading
    # "./" makes the expression relative to that element.
    link_text = li.xpath('./a//text()')[0]
    print(link_text)
# Elsie
# Lacie
# Tillie

# "|" unions several path expressions.  Both branches start with "//", so they
# are evaluated from the document root; query `tree` rather than the leftover
# loop variable `li`, which would be undefined if li_list were empty and which
# wrongly suggests a search relative to the last <li>.
tree.xpath('//div/ul/li/a | //div/ul/li/div/a')
# [<Element a at 0x7fb83849b710>, <Element a at 0x7fb83844cc68>, <Element a at 0x7fb83849b7e8>]