==Beautiful Soup (HTML XML解析)==
[[Python]]
*http://www.crummy.com/software/BeautifulSoup/
import BeautifulSoup # To get everything
<<blockquote>>No module named BeautifulSoup エラーとなる場合<</blockquote>>
from bs4 import BeautifulSoup # To get everything
==解析==
*文字列およびファイルハンドルによる文書解析
soup = BeautifulSoup(open("index.html"))
soup = BeautifulSoup("<<html>>data<</html>>")
*URLを指定して解析
*もっとも知られているのが、class
*Beautiful Soupでは、リストとして扱う
css_soup = BeautifulSoup('<<p class="body strikeout"><></p>>')
css_soup.p['class']
# ["body", "strikeout"]
*XMLでは、複数値として扱わない
xml_soup = BeautifulSoup('<<p class="body strikeout"><></p>>', 'xml')
xml_soup.p['class']
# u'body strikeout'
# u'Extremely bold'
type(tag.string)
# <<class 'bs4.element.NavigableString'>>
*unicode() で、Unicode 文字列に変換できる
unicode_string = unicode(tag.string)
# u'Extremely bold'
type(unicode_string)
# <<type 'unicode'>>
===BeautifulSoup===
*文書全体を表す
===コメント===
*Commentオブジェクトは、NavigableStringの特殊型
markup = "<<b><><!--Hey, buddy. Want to buy a used parser?--><></b>>"
soup = BeautifulSoup(markup)
comment = soup.b.string
type(comment)
# <<class 'bs4.element.Comment'>>
==treeのナビゲート==
===下がる===
====.contents と .children====
head_tag.contents
[<<title>>The Dormouse's story<</title>>]
for child in title_tag.children:
for child in head_tag.descendants:
print(child)
# <<title>>The Dormouse's story<</title>>
# The Dormouse's story
====.string====
title_tag = soup.title
title_tag
# <<title>>The Dormouse's story<</title>>
title_tag.parent
# <<head><><title>>The Dormouse's story<</title><></head>>
====.parents====
link = soup.a
link
# <<a class="sister" href="http://example.com/elsie" id="link1">>Elsie<</a>>
for parent in link.parents:
if parent is None:
====.next_sibling と .previous_sibling====
sibling_soup.b.next_sibling
# <<c>>text2<</c>>
sibling_soup.c.previous_sibling
# <<b>>text1<</b>>
====.next_siblings と .previous_siblings====
for sibling in soup.a.next_siblings:
*以下はほぼ同等
soup.find_all('title', limit=1)
# [<<title>>The Dormouse's story<</title>>]
soup.find('title')
# <<title>>The Dormouse's story<</title>>
=====CSSクラスで検索=====
soup.find("b", { "class" : "lime" })
# <<b class="lime">>Lime<</b>>
====find_parents() と find_parent()====
====find_next_siblings() と find_next_sibling()====
paraText = soup.find(text='This is paragraph ')
paraText.findNextSiblings('b')
# [<<b>>one<</b>>]
paraText.findNextSibling(text = lambda(text): len(text) == 1)
soup.select("body a")
====直下のタグ====
soup.select("head > > title")
====CSS class====
soup.select(".sister")
tag.string = "New link text."
===append()===
soup = BeautifulSoup("<<a>>Foo<</a>>")
soup.a.append("Bar")
===BeautifulSoup.new_string() と .new_tag()===
====new_string()====
soup = BeautifulSoup("<<b><></b>>")
tag = soup.b
tag.append("Hello")
tag.append(new_string)
tag
# <<b>>Hello there.<</b>>
tag.contents
# [u'Hello', u' there']
====new_tag()====
soup = BeautifulSoup("<<b><></b>>")
original_tag = soup.b
original_tag.append(new_tag)
original_tag
# <<b><><a href="http://www.example.com"><></a><></b> >
new_tag.string = "Link text."
original_tag
# <<b><><a href="http://www.example.com">>Link text.<</a><></b>>
===insert()===
markup = '<<a href="http://example.com/">>I linked to <<i>>example.com<</i><></a>>'
soup = BeautifulSoup(markup)
tag = soup.a
tag.insert(1, "but did not endorse ")
tag
# <<a href="http://example.com/">>I linked to but did not endorse <<i>>example.com<</i><></a>>
tag.contents
# [u'I linked to ', u'but did not endorse', <<i>>example.com<</i>>]
===insert_before() と insert_after()===
===clear()===
markup = '<<a href="http://example.com/">>I linked to <<i>>example.com<</i><></a>>'
soup = BeautifulSoup(markup)
tag = soup.a
tag.clear()
tag
# <<a href="http://example.com/"><></a>>
===extract()===
*タグもしくは文字列をツリーから削除
markup = '<<a href="http://example.com/">>I linked to <<i>>example.com<</i><></a>>'
soup = BeautifulSoup(markup)
a_tag = soup.a
a_tag
# <<a href="http://example.com/">>I linked to<</a>>
i_tag
# <<i>>example.com<</i>>
===decompose()===
*タグをツリーから取り除く
markup = '<<a href="http://example.com/">>I linked to <<i>>example.com<</i><></a>>'
soup = BeautifulSoup(markup)
a_tag = soup.a
a_tag
# <<a href="http://example.com/">>I linked to<</a>>
===replace_with()===
*タグおよび文字列をツリーから取り除き、別のタグおよび文字列に置き換える
===wrap()===
*要素をタグでラップする
soup = BeautifulSoup("<<p>>I wish I was bold.<</p>>")
soup.p.string.wrap(soup.new_tag("b"))
# <<b>>I wish I was bold.<</b>>
===unwrap()===
*タグをはがす
markup = '<<a href="http://example.com/">>I linked to <<i>>example.com<</i><></a>>'
soup = BeautifulSoup(markup)
a_tag = soup.a
a_tag.i.unwrap()
a_tag
# <<a href="http://example.com/">>I linked to example.com<</a>>
==出力==
===Pretty-printing===