Python HTML解析库Beautiful Soup

Python 专栏收录该内容
431 篇文章 2 订阅

简介

Beautiful Soup 是 Python 的 HTML/XML 解析器,可以很好地处理不规范标记并生成剖析树(parse tree)。

Beautiful Soup 提供简单实用的导航,搜索以及修改剖析树的操作,大大节省编程时间。

本文代码




安装

pip install lxml beautifulsoup4




初试

测试页面

<html>
<head><title>Page title</title></head>
<body>
<p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
<p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
</body>
</html>

长这样
在这里插入图片描述
代码

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # 使用lxml解析器

print(soup.contents[0].name)  # 第一个节点的名字
# 'html'
print(soup.contents[0].contents[0].name)  # 第一个节点的第一个节点的名字
# 'head'

head = soup.contents[0].contents[0]
print(head.parent.name)  # 父节点
# 'html'

print(head.next)  # 下一个节点
# <title>Page title</title>

print(head.nextSibling.name)  # 下一个兄弟节点的名字
# 'body'

print(head.nextSibling.contents[0])
# <p id="firstpara" align="center">This is paragraph <b>one</b>.</p>

print(head.nextSibling.contents[0].nextSibling)
# <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>




搜索标签和属性

  1. `.`(点号):按标签名取节点,如 soup.html.head.title
  2. .string:取节点的文本内容
  3. soup('xxx'):查找所有 xxx 标签,相当于 soup.findAll('xxx')
import re
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # 使用lxml解析器

titleTag = soup.html.head.title  # 取节点
print(titleTag)
# <title>Page title</title>

print(titleTag.string)  # 取内容
# 'Page title'

print(soup('p'))  # 查找标签
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll('p', align="center"))  # 指定属性查找所有,相当于soup('p', align="center")
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>]
print(soup('p', align="center"))  # 同上

print(soup.find('p', align="center"))  # 只查找一个
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>

print(soup('p', align="center")[0]['id'])  # 取出id
# 'firstpara'

print(soup.find('p', align=re.compile('^b.*'))['id'])  # 查找align为'b'开头的元素
# 'secondpara'

print(soup.find('p').b.string)  # p元素 → b元素的内容
# 'one'

print(soup('p')[1].b.string)  # 所有p元素 → 第二个 → b元素的内容
# 'two'




导航

| 属性 | 含义 |
| --- | --- |
| parent | 父节点 |
| contents | 子节点列表 |
| string | 字符串内容 |
| nextSibling | 下一个兄弟节点 |
| previousSibling | 上一个兄弟节点 |
| next | 按解析顺序的下一个节点 |
| previous | 按解析顺序的上一个节点 |

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # 使用lxml解析器

print(soup.head.parent)  # 父节点
# <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>

print(soup.head.contents)  # 子节点
print(soup.p.contents)  # 子节点
# [<title>Page title</title>]
# ['This is paragraph ', <b>one</b>, '.']

print(soup.b.string)  # 字符串内容
# one

print(soup.head.nextSibling)  # 下一个兄弟节点
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>

print(soup.body.previousSibling)  # 上一个兄弟节点
# <head><title>Page title</title></head>

print(soup.head.next)  # 下一层处理次序
print(soup.head.next.next)  # 下一层处理次序
print(soup.head.next.next.next)  # 下一层处理次序
# <title>Page title</title>
# Page title
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>

print(soup.head.previous)  # 上一层处理次序
# <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>




搜索

| 方法 | 含义 |
| --- | --- |
| `def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)` | 所有匹配元素 |
| `def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)` | 第一个匹配元素 |
| `def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)` | 后面所有匹配兄弟节点 |
| `def findNextSibling(self, name=None, attrs={}, text=None, **kwargs)` | 后面第一个匹配兄弟节点 |
| `def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)` | 前面所有匹配兄弟节点 |
| `def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs)` | 前面第一个匹配兄弟节点 |
| `def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs)` | 按文档顺序,之后所有匹配元素 |
| `def findNext(self, name=None, attrs={}, text=None, **kwargs)` | 按文档顺序,之后第一个匹配元素 |
| `def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, **kwargs)` | 按文档顺序,之前所有匹配元素 |
| `def findPrevious(self, name=None, attrs={}, text=None, **kwargs)` | 按文档顺序,之前第一个匹配元素 |
| `def findParents(self, name=None, attrs={}, limit=None, **kwargs)` | 所有匹配父节点 |
| `def findParent(self, name=None, attrs={}, **kwargs)` | 第一个匹配父节点 |

注:以上驼峰命名是 Beautiful Soup 3 的旧写法,bs4 中仍可使用,但推荐使用对应的下划线命名(如 find_all、find_next_sibling)。

1. 所有匹配

def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)

import re
from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # 使用lxml解析器

print(soup.findAll('b'))  # 提取所有匹配元素
# [<b>one</b>, <b>two</b>]

print(soup.findAll(re.compile('^b')))  # 以b开头
# [<body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <b>one</b>, <b>two</b>]

print(soup.findAll(['title', 'p']))  # title和p
# [<title>Page title</title>, <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll({'title': True, 'p': True}))  # 同上,更快
# [<title>Page title</title>, <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll(lambda tag: len(tag.attrs) == 2))  # 传一个返回布尔值的callable对象
print(soup.findAll(lambda tag: len(tag.name) == 1 and not tag.attrs))  # 单个字符的标签名且无属性
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<b>one</b>, <b>two</b>]

print(soup.findAll(align="center"))  # 指定属性筛选
print(soup.findAll(id=re.compile("para$")))  # 可以传字符串,正则表达式,列表,哈希表
print(soup.findAll(align=["center", "blah"]))
print(soup.findAll(align=lambda value: value and len(value) < 5))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll(align=True))  # 匹配有align属性的元素
print(soup.findAll(align=None))  # 匹配无align属性的元素
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>, <head><title>Page title</title></head>, <title>Page title</title>, <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <b>one</b>, <b>two</b>]

print(soup.findAll(id=re.compile("para$")))  # 与保留字有冲突时使用attrs参数,传入字典
print(soup.findAll(attrs={'id': re.compile("para$")}))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

print(soup.findAll(text="one"))  # 匹配内容
print(soup.findAll(text=["one", "two"]))  # 可以传字符串,正则表达式,列表,哈希表
print(soup.findAll(text=re.compile("paragraph")))
print(soup.findAll(text=True))
print(soup.findAll(text=lambda x: len(x) < 12))
# ['one']
# ['one', 'two']
# ['This is paragraph ', 'This is paragraph ']
# ['Page title', 'This is paragraph ', 'one', '.', 'This is paragraph ', 'two', '.']
# ['Page title', 'one', '.', 'two', '.']

print([tag.name for tag in soup.html.findAll()])  # 默认递归遍历
print([tag.name for tag in soup.html.findAll(recursive=False)])  # 不递归遍历
# ['head', 'title', 'body', 'p', 'b', 'p', 'b']
# ['head', 'body']

print(soup.findAll('p', limit=1))  # 最大匹配个数
print(soup.findAll('p', limit=100))  #
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]

2. 第一个匹配

def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # 使用lxml解析器

print(soup.find('p'))  # 提取第一个匹配元素
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>

3. 兄弟节点

所有兄弟节点

def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)

一个兄弟节点

def findNextSibling(self, name=None, attrs={}, text=None, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # 使用lxml解析器

print(soup.find(text='This is paragraph ').findNextSiblings('b'))  # 后面所有匹配兄弟节点
print(soup.find(text='This is paragraph ').findNextSibling(text=lambda text: len(text) == 1))  # 后面第一个匹配兄弟节点
print(soup.find(text='.').findPreviousSiblings('b'))  # 前面所有匹配兄弟节点
print(soup.find(text='.').findPreviousSibling(text=True))  # 前面第一个匹配兄弟节点
# [<b>one</b>]
# .
# [<b>one</b>]
# This is paragraph 

4. 上下层

之后所有匹配元素(按文档顺序向后查找,不限于子节点)

def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs)

之后第一个匹配元素(按文档顺序向后查找)

def findNext(self, name=None, attrs={}, text=None, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # 使用lxml解析器

print(soup.find('p').findAllNext(text=True))  # 下层所有含text的元素
print(soup.find('p').findNext('p'))  # 第一个p的下一个p
print(soup.find('p').findNext('b'))  # 第一个p的下一个b
# ['This is paragraph ', 'one', '.', 'This is paragraph ', 'two', '.']
# <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>
# <b>one</b>

print(soup('p')[-1].findAllPrevious(text=True))  # 上层所有含text的元素
print(soup('p')[-1].findPrevious('p'))
print(soup('p')[-1].findPrevious('b'))
# ['.', 'one', 'This is paragraph ', 'Page title']
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>
# <b>one</b>

5. 父节点

所有匹配父节点

def findParents(self, name=None, attrs={}, limit=None, **kwargs)

第一个匹配父节点

def findParent(self, name=None, attrs={}, **kwargs)

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # 使用lxml解析器

print(soup.find('b').findParents())  # 所有匹配父节点
print(soup.find('b').findParent('body'))  # 第一个匹配父节点
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>, <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>]
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>




查找class

传参 class_

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p class="firstpara" align="center">This is paragraph <b>one</b>.</p><p class="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')  # 使用lxml解析器
print(soup(class_='firstpara'))
# [<p align="center" class="firstpara">This is paragraph <b>one</b>.</p>]




美化

prettify()

from bs4 import BeautifulSoup

content = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(content, 'lxml')
print(soup.prettify())




解析表格

使用 PrettyTable

pip install prettytable
from bs4 import BeautifulSoup
from itertools import zip_longest
from prettytable import PrettyTable

html = '''<html><body>
<table border="1">
  <tr>
    <th>学号</th>
    <th>姓名</th>
  </tr>
  <tr>
    <td>1</td>
    <td>张三</td>
  </tr>
  <tr>
    <td>2</td>
    <td>李四</td>
  </tr>
  <tr>
    <td>3</td>
    <td>王五</td>
  </tr>
</table>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
th = soup('th')  # 表头
th = [i.string for i in th]
td = soup('td')  # 单元格
td = [i.string for i in td]
td = list(zip_longest(*([iter(td)] * len(th))))  # 根据th的长度分组
print(th)
print(td)

x = PrettyTable()
x.field_names = th  # 表头
for i in td:
    x.add_row(i)  # 添加一行数据
print(x)
# ['学号', '姓名']
# [('1', '张三'), ('2', '李四'), ('3', '王五')]
# +------+------+
# | 学号 | 姓名 |
# +------+------+
# |  1   | 张三 |
# |  2   | 李四 |
# |  3   | 王五 |
# +------+------+

推荐阅读:Python表格美化库PrettyTable中文文档




修改树

推荐阅读:修改剖析树




删除特定class或id

from bs4 import BeautifulSoup

content = '''<html>
<body>
<div id="first" class="d"><p>1</p></div>
<div id="second" class="d"><p>2</p></div>
</body>
</html>'''
soup = BeautifulSoup(content, 'lxml')
for div in soup.find_all('div', {'class': 'd'}):
    div.decompose()
print(soup.prettify())
# <html>
#  <body>
#  </body>
# </html>




报错 bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested

pip install wheel
pip install -U lxml




参考文献

  1. Beautiful Soup: We called him Tortoise because he taught us.
  2. Beautiful Soup 官方文档
  3. Beautiful Soup 中文文档
  4. BeautifulSoup库报错:bs4.FeatureNotFound: Couldnt find a tree builder with the features you requested
  5. Python BeautifulSoup删除具有特定类(class)的div
  • 0
    点赞
  • 0
    评论
  • 0
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

打赏
文章很值,打赏犒劳作者一下
相关推荐
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页

打赏

XerCis

你的鼓励将是我创作的最大动力

¥2 ¥4 ¥6 ¥10 ¥20
输入1-500的整数
余额支付 (余额:-- )
扫码支付
扫码支付:¥2
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值