1 #!/usr/bin/python 2 # coding=utf-8 3 4 from bs4 import BeautifulSoup 5 import re 6 7 html_doc = """ 8The Dormouse's story 9 10The Dormouse's story
11 12Once upon a time there were three little sisters; and their names were13 Elsie,14 Lacie and15 Tillie;16 and they lived at the bottom of a well.
17 18...
19 """20 21 soup = BeautifulSoup(html_doc,'html.parser',from_encoding = 'utf-8')22 23 print '获取所有的链接'24 links = soup.find_all('a')25 for link in links:26 print link.name, link['href'],link.get_text()27 28 print '获取lacie的链接'29 link_node = soup.find('a',href='http://example.com/lacie')30 print link_node.name, link_node['href'],link_node.get_text()31 32 print '正则匹配 ill'33 #r"" ,字符串中反斜线 只用写一次34 link_node = soup.find('a',href=re.compile(r"ill") ) 35 print link_node.name, link_node['href'],link_node.get_text()36 37 print '获取p段落文字'38 #r"" ,字符串中反斜线 只用写一次39 p_node = soup.find('p',class_="title" ) 40 print p_node.name, p_node.get_text()
结果:
获取所有的链接
a http://example.com/elsie Elsie
a http://example.com/lacie Lacie
a http://example.com/tillie Tillie
获取lacie的链接
a http://example.com/lacie Lacie
正则匹配 ill
a http://example.com/tillie Tillie
获取p段落文字
p The Dormouse's story