1、BeautifulSoup 解析html如何使用
'''
想要学习Python?Python学习交流群:973783996满足你的需求,资料都已经上传群文件,可以自行下载!
'''
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import re

# HTML snippet to parse. This is the classic "Dormouse" example from the
# BeautifulSoup documentation; the tags were stripped from the original
# paste and are restored here (without them every lookup below is None).
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</body></html>
"""

# Build a BeautifulSoup tree from the HTML string.
# from_encoding only applies to bytes input, so it is omitted for a str.
soup = BeautifulSoup(html_doc, 'html.parser')

# First <title> tag.
print(soup.title)
# Tag name of the first <title> tag.
print(soup.title.name)
# Text content of the first <title> tag.
print(soup.title.string)
# Tag name of the parent of the first <title> tag.
print(soup.title.parent.name)
# First <p> tag.
print(soup.p)
# 'class' attribute of the first <p> tag (a list of classes).
print(soup.p['class'])
# 'href' attribute of the first <a> tag.
print(soup.a['href'])

# Tag attributes can be added, changed or deleted exactly like dict entries.
# Change the href of the first <a> tag.
soup.a['href'] = 'http://www.baidu.com/'
# Add a 'name' attribute to the first <a> tag.
soup.a['name'] = u'百度'
# Delete the 'class' attribute of the first <a> tag.
del soup.a['class']

# All direct children of the first <p> tag.
print(soup.p.contents)
# First <a> tag (now carrying the modified attributes).
print(soup.a)
# Every <a> tag, as a list.
print(soup.find_all('a'))
# First tag whose id attribute equals "link3".
print(soup.find(id="link3"))
# All text content of the document.
print(soup.get_text())
# All attributes of the first <a> tag, as a dict.
print(soup.a.attrs)

# href of every <a> tag.
for link in soup.find_all('a'):
    print(link.get('href'))

# Iterate over the children of the first <p> tag.
for child in soup.p.children:
    print(child)

# Regex match: every tag whose name contains "b" (html, body, b).
for tag in soup.find_all(re.compile("b")):
    print(tag.name)
2、cookie等使用方法以及函数爬虫
参照: https://cuiqingcai.com/968.html
3、header,代理,超时,认证,异常处理
参照: http://blog.csdn.net/m_buddy/article/details/55193762
4、错误异常处理
1.URLError
# -*- coding: UTF-8 -*-
import urllib.request
import urllib.error

if __name__ == "__main__":
    # A page that does not exist on the server, so the request fails.
    url = "http://www.douyu.com/Jack_Cui.html"
    # Renamed from `request` so it no longer shadows the urllib.request module.
    req = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(req)
        # html = response.read()
    except urllib.error.HTTPError as e:
        # HTTPError carries the HTTP status code (e.g. 403 / 404).
        print(e.code)
运行结果:
C:\Python34\python.exe G:/xiaoshuo2.py
Process finished with exit code 0
# -*- coding: UTF-8 -*-
import urllib.request
import urllib.error

if __name__ == "__main__":
    # A page that does not exist on the server, so the request fails.
    url = "http://www.douyu.com/Jack_Cui.html"
    # Renamed from `request` so it no longer shadows the urllib.request module.
    req = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(req)
        # On success, read the body and decode it as UTF-8 text.
        html = response.read().decode('utf-8')
        print(html)
    except urllib.error.HTTPError as e:
        # HTTPError carries the HTTP status code (e.g. 403 / 404).
        print(e.code)
运行结果:
C:\Python34\python.exe G:/xiaoshuo2.py
Process finished with exit code 0
import urllib.request
import urllib.error

# A page that does not exist on the server, so the request fails.
url = "http://www.douyu.com/Jack_Cui.html"
rep = urllib.request.Request(url)
try:
    data = urllib.request.urlopen(rep)
except urllib.error.URLError as e:
    # HTTPError is a subclass of URLError: a 'code' attribute is only
    # present for HTTP-level errors, while 'reason' exists on both —
    # for an HTTP error both branches fire (matching the output below).
    if hasattr(e, 'code'):
        print("HTTPError")
        print(e.code)
    if hasattr(e, 'reason'):
        print("URLError")
        print(e.reason)
输出结果:
C:\Python34\python.exe G:/xiaoshuo2.py
HTTPError
URLError
Forbidden
Process finished with exit code 0
5、python打印防止换行和换行
https://www.cnblogs.com/kfx2007/p/5970784.html
实例:
# coding=utf-8
import re

# HTML table to parse. The tags were stripped from the original paste;
# they are reconstructed here so the script produces the expected output
# shown below it (性別: 男 / 異名: (字) 翔宇 / 爱好: 篮球 / 籍貫: 广西省 桂林市).
language = '''
<table>
<tr>
<th>性別:</th>
<td>男</td>
</tr>
<tr>
<th>異名:</th>
<td><span>(字) 翔宇</span></td>
</tr>
<tr>
<th>爱好:</th>
<td><a href="#">篮球</a></td>
</tr>
<tr>
<th>籍貫:</th>
<td><a href="#">广西省</a> <a href="#">桂林市</a></td>
</tr>
</table>
'''
# Every <tr>...</tr> row of the table.
res_tr = r'<tr>(.*?)</tr>'
m_tr = re.findall(res_tr, language, re.S | re.M)
for line in m_tr:
    # First column: the <th> header cell.
    res_th = r'<th>(.*?)</th>'
    m_th = re.findall(res_th, line, re.S | re.M)
    for mm in m_th:
        if "href" in mm:  # header cell wrapped in a hyperlink: keep its text
            restr = r'<a href=.*?>(.*?)</a>'
            h = re.findall(restr, mm, re.S | re.M)
            print(h[0], end=' ')  # end=' ' suppresses the newline
        else:
            print(mm, end=' ')
    # Second column: the <td> value cell.
    res_td = r'<td>(.*?)</td>'
    m_td = re.findall(res_td, line, re.S | re.M)
    for nn in m_td:
        if "href" in nn:  # value wrapped in one or more hyperlinks
            res_value = r'<a href=.*?>(.*?)</a>'
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value, end=' ')
        elif "span" in nn:  # value wrapped in a <span>, e.g. (字) 翔宇
            res_value = r'<span>(.*?)</span>'
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value, end=' ')
        else:
            print(nn, end=' ')
    print(' ')  # newline after each table row
C:\Python34\python.exe G:/xiaoshuo2.py
性別: 男
異名: (字) 翔宇
爱好: 篮球
籍貫: 广西省 桂林市
6、python打印如何不换行
https://www.cnblogs.com/hwd9654/p/5707920.html
# -*- coding:utf-8 -*-
import urllib
import re
#import requests
import urllib.parse
import urllib.request
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
class Tool:
    """Convert Tieba post HTML markup to readable plain text.

    The regex patterns were garbled in the original paste (the HTML tags
    were stripped by the scrape, leaving patterns like '|||' that match the
    empty string everywhere); they are restored here from the well-known
    Tieba-spider tutorial this snippet is taken from.
    """
    # <img ...> tags, and the 7-space indent Tieba inserts before images
    removeImg = re.compile('<img.*?>| {7}')
    # hyperlinks: opening <a ...> and closing </a>
    removeAddr = re.compile('<a.*?>|</a>')
    # tags that should turn into a newline
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # table cells become tabs
    replaceTD = re.compile('<td>')
    # paragraph openings become newlines
    replacePara = re.compile('<p.*?>')
    # line breaks become newlines
    replaceBR = re.compile('<br><br>|<br>')
    # any remaining tag is dropped entirely
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Return *x* with HTML markup replaced by plain-text equivalents."""
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()
class BDTB:
def __init__(self,baseUrl,seeLZ):
self.baseURL = baseUrl
self.seeLZ = '?see_lz='+str(seeLZ)
self.tool = Tool()
def getPage(self,pageNum):
try:
url = self.baseURL+ self.seeLZ + '&pn=' + str(pageNum)
request = urllib.request.Request(url)
response = urllib.request.urlopen(request).read().decode("utf8")
#print (response)
return response
except urllib.error.URLError as e:
if hasattr(e,"reason"):
print ("连接百度贴吧失败,错误原因",e.reason)
return None
def getTitle(self):
page = self.getPage(1)
pattern = re.compile('
关注
打赏
最近更新
- 深拷贝和浅拷贝的区别(重点)
- 【Vue】走进Vue框架世界
- 【云服务器】项目部署—搭建网站—vue电商后台管理系统
- 【React介绍】 一文带你深入React
- 【React】React组件实例的三大属性之state,props,refs(你学废了吗)
- 【脚手架VueCLI】从零开始,创建一个VUE项目
- 【React】深入理解React组件生命周期----图文详解(含代码)
- 【React】DOM的Diffing算法是什么?以及DOM中key的作用----经典面试题
- 【React】1_使用React脚手架创建项目步骤--------详解(含项目结构说明)
- 【React】2_如何使用react脚手架写一个简单的页面?