# Project: scrape street-snap ("街拍") image galleries from Toutiao (今日头条)
# Approach:
# 1. Inspect the list page to find the Ajax request URL and its parameters
# 2. Parse the returned JSON and extract the gallery links
# 3. Fetch each gallery detail page
# 4. Parse the detail page: BeautifulSoup extracts the gallery title, a regex extracts the image links
# 5. Save the gallery title and links to MongoDB
# 6. Download the images and save them to a local folder
# 7. Once debugging is done, run the whole pipeline with multiple processes
Code implementation
import requests
import json
import re
import pymongo
import os
from hashlib import md5
from bs4 import BeautifulSoup
from multiprocessing import Pool
from requests.exceptions import RequestException
# MongoDB configuration
# connect=False defers the connection until after the worker processes fork,
# which avoids the "UserWarning: MongoClient opened before fork" warning
client = pymongo.MongoClient("localhost", connect=False)
db = client["toutiao"]
def get_page_index(keyword, offset):
    # Fetch one page of the gallery list via the search Ajax endpoint
    url = "https://www.toutiao.com/search_content/"
    parameters = {
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": 20,
        "cur_tab": 3,
        "from": "gallery",
    }
    try:
        response = requests.get(url, params=parameters, timeout=10)
        return response.text
    except RequestException:
        print("Failed to fetch the gallery list page, offset:", offset)
        return None
def parse_page_index(text):
    # Parse the gallery list JSON and yield the gallery detail-page URLs
    if not text:
        return
    result = json.loads(text)
    for item in result.get("data") or []:
        if item.get("id"):
            yield "https://www.toutiao.com/a" + item.get("id")
def get_page_detail(url):
    # Fetch a gallery detail page
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        return response.text
    except RequestException:
        print("Failed to fetch the detail page:", url)
        return None
def parse_page_detail(html, url):
    # Parse a gallery detail page
    if not html:
        return None
    # Gallery title
    soup = BeautifulSoup(html, features="lxml")
    title = soup.find("title").get_text()
    # The image links are embedded in a JSON string passed to JSON.parse
    pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)
    result = pattern.search(html)
    if result:
        # Strip the escaping backslashes so the captured string becomes valid JSON
        ret = result.group(1).replace("\\", "")
        try:
            data = json.loads(ret)
            sub_images = data.get("sub_images") or []
            return {
                "title": title,
                "url": url,
                "images": [x.get("url") for x in sub_images],
            }
        except json.decoder.JSONDecodeError:
            print("json.decoder.JSONDecodeError")
    return None
def save_to_mongo(dct):
    # Save a gallery record to MongoDB
    try:
        ret = db["toutiao"].insert_one(dct)
        print("Saved to MongoDB:", ret.inserted_id)
    except Exception as e:
        print("Failed to save to MongoDB:", e)
def download_image(url):
    # Download one image
    print("Downloading image:", url)
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        print("Failed to download image:", url)
def save_image(content):
    # Save an image to the local images/ folder, named by the MD5 of its content
    file_dir = os.path.join(os.getcwd(), "images")
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    path = os.path.join(file_dir, md5(content).hexdigest() + ".jpg")
    # The MD5 filename doubles as de-duplication: identical images are written only once
    if not os.path.exists(path):
        with open(path, "wb") as f:
            f.write(content)
        print("Saved:", path)
def main(offset):
    # Entry point for one offset: list page -> detail pages -> MongoDB + image downloads
    text = get_page_index(offset=offset, keyword="街拍")  # "街拍" = street snaps
    for url in parse_page_index(text):
        html = get_page_detail(url)
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)
            for image_url in result.get("images"):
                download_image(image_url)
if __name__ == "__main__":
    # Use a process pool so several offsets are crawled in parallel
    pool = Pool()
    pool.map(main, [x * 20 for x in range(10)])
    pool.close()
    pool.join()
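# Debugging tip (per step 7 above): it can help to run a single offset synchronously,
# e.g. main(0), and switch to pool.map only once one pass completes cleanly.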