通过斗鱼提供的 API 获取 JSON 数据,从中解析出图片地址,即可下载直播间的图片。斗鱼 API 接口:
http://open.douyucdn.cn/api/RoomApi/live/{num}
比如: http://open.douyucdn.cn/api/RoomApi/live/1
也可以用下面这个接口批量获取主播的图片(有很多妹子,当然也有小哥哥),其中 limit 为每页返回的条数,offset 为起始偏移量: http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=
项目文件为 spider.py、item.py、pipeline.py(在下面代码的导入语句中,它们对应的模块名分别是 douyu_spider、douyu_item、douyu_pipeline)
spider.py
# -*- coding: utf-8 -*-
# 通过接口,爬取斗鱼图片保存本地
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import scrapy
import os
from douyu_item import DouyuItem
import json
# Where downloaded images are stored; this can also be configured centrally
# in settings.py.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
IMAGES_STORE = os.path.join(BASE_DIR, "images")
class DouyuSpider(scrapy.Spider):
    """Page through the Douyu room-list API and emit one item per room.

    Each API response is a JSON object whose ``data`` field holds a list of
    rooms. For every room the spider yields a ``DouyuItem`` carrying the room
    name and the room image URL, then requests the next page. Crawling stops
    when a page has no rooms.
    """

    name = "douyu"
    allowed_domains = ["douyucdn.cn"]

    # Configure the image store location and the pipeline that handles items.
    custom_settings = {
        "IMAGES_STORE": IMAGES_STORE,
        "ITEM_PIPELINES": {
            "myspider.douyu_spider.douyu_pipeline.DouyuPipeline": 100,
        },
    }

    # Number of rooms requested per page; also the step added to ``offset``.
    page_size = 20

    # API endpoint.
    # Alternative endpoint: "http://open.douyucdn.cn/api/RoomApi/live/"
    base_url = (
        "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=%d&offset="
        % page_size
    )
    offset = 0
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        """Parse one API page, yield its items, then request the next page.

        :param response: the Scrapy response for one API page (JSON body).
        :returns: a generator of ``DouyuItem`` objects followed by the
            request for the next page; nothing when the page has no rooms.
        """
        rooms = json.loads(response.body).get("data")

        # Stop condition: a missing, null or empty "data" field all mean
        # there is nothing left to iterate over.
        if not rooms:
            self.logger.info("已经没有图片啦")
            return

        for room in rooms:
            item = DouyuItem()
            item["img_name"] = room.get("room_name")
            item["img_link"] = room.get("room_src")
            yield item

        # Continue with the next page.
        self.offset += self.page_size
        yield scrapy.Request(
            self.base_url + str(self.offset),
            callback=self.parse,
        )
item.py
# -*- coding: utf-8 -*-
import scrapy
class DouyuItem(scrapy.Item):
    """Item describing one image to download."""

    # Image name.
    img_name = scrapy.Field()
    # Image link.
    img_link = scrapy.Field()
pipeline.py
# -*- coding: utf-8 -*-
import logging
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline

# The spider module defines the store directory as IMAGES_STORE; alias it to
# the name used in this module.
from myspider.douyu_spider.douyu_spider import IMAGES_STORE as images_path
class DouyuPipeline(ImagesPipeline):
    """Download each item's image and rename the file after the item's name.

    The base pipeline stores every download under a hashed path such as
    ``full/<sha1>.jpg`` inside the image store directory. After a successful
    download this pipeline moves that file to ``<img_name>.jpg`` directly
    inside the store directory.
    """

    # Path separators and characters that common filesystems reject in names.
    _UNSAFE_CHARS = '\\/:*?"<>|'

    @staticmethod
    def _safe_name(raw_name):
        """Return ``raw_name`` with filename-unsafe characters replaced by "_"."""
        if not raw_name:
            return ""
        cleaned = "".join(
            "_" if ch in DouyuPipeline._UNSAFE_CHARS else ch for ch in raw_name
        )
        return cleaned.strip()

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image link, if it has one."""
        img_link = item.get("img_link")
        if not img_link:
            # Nothing to download; Request() would raise on a missing URL.
            return
        yield scrapy.Request(img_link)

    def item_completed(self, results, item, info):
        """Rename the downloaded image after the item's ``img_name``.

        ``results`` is a list of ``(ok, info)`` tuples, for example::

            [(True,
              {'url': 'https://rpic.douyucdn.cn/amrpic-180422/4475021_1048.jpg',
               'path': 'full/aa6df7582a33bbe025ec0e3ebd21ff133aa56b36.jpg',
               'checksum': 'bc89354a577ee6cf22a7d065859bc990'})]

        :returns: the item, unchanged. When the download failed, the name is
            unusable, or the rename fails, a warning is logged and the file
            (if any) keeps its hashed name.
        """
        log = logging.getLogger(__name__)

        downloaded = [file_info["path"] for ok, file_info in results if ok]
        if not downloaded:
            log.warning("Image download failed for item: %r", item.get("img_link"))
            return item

        file_name = self._safe_name(item.get("img_name"))
        if not file_name:
            log.warning("Item has no usable img_name; keeping %s", downloaded[0])
            return item

        old_path = os.path.join(images_path, downloaded[0])
        new_path = os.path.join(images_path, file_name + ".jpg")

        # Rename; a failure (e.g. the target already exists on Windows) must
        # not abort processing of the item.
        try:
            os.rename(old_path, new_path)
        except OSError as exc:
            log.warning("Could not rename %s to %s: %s", old_path, new_path, exc)
        return item