Skip to content

Commit e484692

Browse files
committed
bs4
1 parent 36525cb commit e484692

File tree

1 file changed

+39
-2
lines changed

1 file changed

+39
-2
lines changed

PythonStu/stu/HTML/BeautifulSoup_iter.py

100644100755
Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,55 @@
1111

1212
html_doc = """
1313
<html><head><title>The Dormouse's story</title></head>
14-
14+
<body>
1515
<p class="title"><b>The Dormouse's story</b></p>
1616
1717
<p class="story">Once upon a time there were three little sisters; and their names were
1818
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
1919
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
2020
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
2121
and they lived at the bottom of a well.</p>
22-
22+
<div id="body"><div id="python">this is python topic</div></div>
2323
<p class="story">...</p>
24+
</body>
25+
</html>
2426
"""
2527

2628
soup = BeautifulSoup(html_doc)
2729

2830

31+
# Access an HTML element directly by its tag name
32+
title = soup.title
33+
print title
34+
35+
# Chained tag access: get the first <b> inside the first <p> of <body>
36+
b = soup.body.p.b
37+
print b
38+
39+
# Find all matching elements: collect every <a> tag in the document
40+
list_a = soup.find_all('a')
41+
print type(list_a)
42+
for item in list_a:
43+
print item
44+
45+
# The .contents attribute exposes a tag's direct children as a list
46+
head_tag = soup.head
47+
print head_tag
48+
49+
print head_tag.contents[0]
50+
51+
# Iterate over a tag's direct child nodes via its .children attribute
52+
for child in soup.body.children:
53+
print child.name
54+
55+
# Iterate over all descendant nodes (children, grandchildren, ...) via .descendants
56+
for child in soup.body.descendants:
57+
print child.name
58+
59+
# Iterate over every text string in the document via .strings
60+
for string in soup.strings:
61+
print repr(string)
62+
63+
# Iterate over the document's text via .stripped_strings, which strips surrounding whitespace and skips whitespace-only strings
64+
for string in soup.stripped_strings:
65+
print repr(string)

0 commit comments

Comments
 (0)