获取二进制的文本内容
import requests
# Download an image and inspect the response object along the way.
# The URL is assembled from adjacent literals so it stays on one logical line:
# a triple-quoted string broken across two lines would embed a newline
# character in the middle of the path.
url = (
    'https://gss0.bdstatic.com/5bVWsj_p_tVS5dKfpU_Y_D3/res/r/image/'
    '2019-09-26/a64fec2c10cfffd46f24eb793692971b.png'
)
response = requests.get(url)
print(response.status_code)  # HTTP status code of the response
print(response.text)  # response body decoded as text (str)
print(response.apparent_encoding)  # encoding detected from the response body
print(response.encoding)  # encoding taken from the response headers
print(response.content)  # response body as raw bytes: images, video, etc.
head = response.request.headers
print(head)  # headers that were sent with the request
# Repeat the request, this time passing the captured request headers explicitly.
response = requests.get(url, headers=head)
print(response.status_code)
pic_name = url.split('/')[-1]  # the last path segment is used as the file name
# Write-only binary mode is enough; the file is never read back here.
with open(pic_name, 'wb') as f:
    f.write(response.content)
利用请求头,爬取豆瓣网
import requests
import re
# Request the Douban movie home page with a browser-style User-Agent header
# and print the resulting HTTP status code.
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4)AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/52.0.2743.116Safari/537.36"
)
headers = {"User-Agent": user_agent}
response = requests.get("https://movie.douban.com/", headers=headers)
print(response.status_code)
获取二进制(图片,视频等)
import requests
# Fetch an image and print its body twice: decoded as text, then as raw bytes.
photo_url = 'https://img2.doubanio.com/view/photo/l/public/p2623301112.webp'
r = requests.get(photo_url)
print(r.text)  # body decoded to str
print(r.content)  # body as bytes
响应对象反序列化json()方法
import requests
# Call a Douban search endpoint and report the Python type obtained by
# deserializing the response body with Response.json().
head = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        "(KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    ),
}
search_url = (
    "https://movie.douban.com/j/new_search_subjects"
    "?sort=U&range=0,10&tags=&start=20"
)
r = requests.get(search_url, headers=head)
print(f"响应对象反序列化类型为{type(r.json())}")
爬取网页的代码框架
import requests
def url_search(url):
    """Fetch *url* and print its body, or a failure message on error.

    The HTTP status code of the attempt is stored in the module-level global
    ``codesxx``; it is ``None`` when the request failed before any response
    was received (for example a connection error or a malformed URL).

    Args:
        url: Address of the page to download.

    Returns:
        None. The page text or the failure message is written to stdout.
    """
    global codesxx
    # Reset before the request so the failure message below never reads an
    # undefined name or a stale code left over from a previous call.
    codesxx = None
    # Bound to the same name that is passed to requests.get(); the dict used
    # to be stored under a misspelled name and was never sent.
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/83.0.4103.61 Safari/537.36"}
    try:
        response = requests.get(url, headers=headers)
        codesxx = response.status_code
        # Turn 4xx/5xx responses into an exception handled below.
        response.raise_for_status()
        # Decode using the encoding detected from the body.
        response.encoding = response.apparent_encoding
        print(response.text)
    except requests.RequestException:
        # Only request-level failures are reported; KeyboardInterrupt and
        # programming errors are no longer swallowed.
        print('爬虫失败,状态码为{}'.format(codesxx))