# 【问题】在京东网站搜索 java 图书,利用 Python 爬取搜索结果前 4 页中的图书图片,并保存到本地。
#导入请求、报错模块&正则表达式类库
import re
import urllib
import urllib.error
import urllib.request

import requests
# Script: search JD for ``key_name`` and save every ``.jpg`` image referenced by
# a ``src="//..."`` attribute on result pages 1-4 into the current directory.

# Search keyword placed in the JD search URL.
key_name = "java"

# Browser-like User-Agent sent with every search-page request. It is identical
# for all pages, so it is built once instead of once per iteration.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"}

# Matching rule: capture the host and path (no scheme, no extension) of each
# protocol-relative ``src="//....jpg"`` attribute. ``[^"]`` keeps a match inside
# one attribute value, and ``\.`` matches a literal dot rather than any char.
pat = r'src="//([^"]+?)\.jpg"'
img_pattern = re.compile(pat)

for i in range(1, 5):
    # Build the search URL for result page ``i``.
    url = "https://search.jd.com/Search?keyword=" + key_name + "&enc=utf-8&page=" + str(i)
    print(url)

    # Fetch the page source. The timeout stops a stalled connection from
    # hanging the script indefinitely.
    data1 = requests.get(url, headers=headers, timeout=10).content.decode()

    # All image URL fragments matched on this page.
    img_url = img_pattern.findall(data1)
    print(img_url)

    # Inner loop: write every matched image to a local file.
    for a_i, this_img in enumerate(img_url):
        # Full URL of this image.
        this_img_url = "http://" + this_img + ".jpg"
        print(this_img_url)

        # Include the page number in the file name; the per-page index alone
        # restarts at 0 and would overwrite images saved from earlier pages.
        img_path = "./imagetb" + str(i) + "_" + str(a_i) + ".jpg"

        try:
            urllib.request.urlretrieve(this_img_url, img_path)
        except urllib.error.URLError as err:
            # A single unreachable image should not abort the whole crawl.
            print("download failed:", this_img_url, err)