lxmlを使ってみる

とりあえず動かしてみる（事前に `import lxml.html` を実行しておくこと。また `cssselect()` を使うには、lxml とは別に cssselect パッケージのインストールが必要）

>>> html = lxml.html.fromstring("""
... <html>
... <head><title>ていちゅらのテストページ</title></head>
... <body>
... <h1 id="main">this is h1</h1>
... <h2>i am h2 - 1</h2>
... <h2>i am h2 - 2</h2>
... </body>
... </html>
... """)
>>>
>>> html
<Element html at 0x7f7798b7c9a8>
>>>
>>> h1 = html.cssselect('h1')
>>> h1
[<Element h1 at 0x7f7798b4f7c8>]
>>> h2 = html.cssselect('h2')
>>> h2
[<Element h2 at 0x7f7798b4f7c8>, <Element h2 at 0x7f7798b7c7c8>]
>>>
>>> h2[0].tag
'h2'
>>> h2[0].text
'i am h2 - 1'
>>> h2[1].tag
'h2'
>>> h2[1].text
'i am h2 - 2'
>>>
>>> h1 = html.xpath('//h1')[0]
>>> h1
<Element h1 at 0x7f7798b4f7c8>
>>> h1.tag
'h1'
>>> h1.text
'this is h1'
>>> h1.get('id')
'main'
>>> h1.attrib
{'id': 'main'}
>>> h1.getparent()
<Element body at 0x7f77a198c188>
>>> h1.getparent().tag
'body'

コメント

タイトルとURLをコピーしました