lxml

lxml 是 libxml2 和 libxslt 这两个 C 库的成熟的 Pythonic 绑定。它通过 ElementTree API 提供对这些库安全、便捷的访问。

它显著扩展了 ElementTree API,提供对 XPath、RelaxNG、XML Schema、XSLT、C14N 等的支持。参见项目主页(https://lxml.de/)和 GitHub 仓库(https://github.com/lxml/lxml)。

etree

# A __future__ import must precede every other statement in a module, so the
# imports come first; placing them after the html_doc assignment is a
# SyntaxError when these snippets are run together as one script.
from __future__ import print_function

from lxml import etree

# Sample HTML document used by all XPath examples that follow. <body> has two
# direct <div> children; the second (class="book") wraps a third <div>
# (class="story") holding two <ul> lists of three <li> items each.
html_doc = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>

    <body>
        <div>
            <p>百里守约</p>
        </div>
        <div class="book">
            <div class="story">
                <ul>
                    <li><a href="http://example.com/elsie" class="sister" id="link1">Elsie</a></li>
                    <li><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a></li>
                    <li><a href="http://example.com/tillie" class="sister" id="link3">Tillie</a></li>
                </ul>
                <ul>
                    <li><script name="name123" id="id123" class="class123" type="type123" style="style123">text123</script></li>
                    <li><script class="class222" ddd>text222</script></li>
                    <li><script class="class333" xxx>text333</script></li>
                </ul>
            </div>
        </div>
    </body>
</html>
"""

# Parse the markup with lxml's HTML parser; the resulting object is what the
# examples query through its .xpath() method.
tree = etree.HTML(html_doc)

tree.xpath() 返回的是一个列表。路径开头的 / 表示从根节点开始定位,路径中间的 / 表示一个层级(直接子节点);路径开头的 // 表示从文档中任意位置开始定位,路径中间的 // 表示跨越多个层级(任意深度的后代节点)。

示例:

# Absolute path: each "/" descends exactly one level, so only the <div>
# elements that are direct children of <body> are selected.
tree.xpath('/html/body/div')
# [<Element div at 0x7fb83844ccf8>, <Element div at 0x7fb83844cb48>]

# "//" inside a path spans any number of levels: every <div> below <html>.
tree.xpath('/html//div')
# [<Element div at 0x7fb83844cb00>, <Element div at 0x7fb83844cc68>, <Element div at 0x7fb83844ccb0>]

# Leading "//" starts the search anywhere in the document: <div> children of any <body>.
tree.xpath('//body/div')
# [<Element div at 0x7fb83844ccf8>, <Element div at 0x7fb83844cb48>]

tree.xpath('//div')  # every <div> in the document
# [<Element div at 0x7fb83844cb00>, <Element div at 0x7fb83844cc68>, <Element div at 0x7fb83844ccb0>]

tree.xpath('//div[@class="book"]')  # filter by attribute value
# [<Element div at 0x7fb83844cb48>]

# Positional predicates are 1-based and apply per parent, so li[2] yields the
# second <li> of each <ul> (two results here, one per list).
tree.xpath('//div[@class="book"]/div/ul/li[2]')  # select by position, counting from 1
# [<Element li at 0x7fb8384682d8>, <Element li at 0x7fb838468320>]

tree.xpath('//div[@class="book"]/div/ul/li[2]/a/text()')[0]  # /text() returns only the element's own text nodes, not text inside child tags
# 'Lacie'

tree.xpath('//div[@class="book"]/div/ul/li[2]//text()')[0]   # //text() returns text nodes at any depth, including those inside child tags
# 'Lacie'

tree.xpath('//div[@class="book"]/div/ul/li[2]/a/@href')  # /@name returns the value of the named attribute
# ['http://example.com/lacie']

# Select the <li> items of the first <ul> inside the class="book" div.
li_list = tree.xpath('//div[@class="book"]/div/ul[1]/li')
# [<Element li at 0x7fb838468680>, <Element li at 0x7fb8384682d8>, <Element li at 0x7fb8384686c8>]

# Elements returned by xpath() can be queried again; a leading "./" makes the
# path relative to that element instead of the whole document.
for li in li_list:
    r = li.xpath('./a//text()')[0]  # first text node under this item's <a>
    print(r)
# Elsie
# Lacie
# Tillie

# "|" unions the results of several paths. Both paths start with "//", so they
# are evaluated against the whole document no matter which element xpath() is
# called on; query `tree` directly instead of relying on the loop variable `li`
# still being bound after the loop (it would not be if li_list were empty).
tree.xpath('//div/ul/li/a | //div/ul/li/div/a')  # union of multiple paths
# [<Element a at 0x7fb83849b710>, <Element a at 0x7fb83844cc68>, <Element a at 0x7fb83849b7e8>]