前言
本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理。
作者: 我姓刘却留不住你的心
PS:如有需要Python学习资料的小伙伴可以加点击下方链接自行获取
python免费学习资料以及群交流解答点击即可加入
最近找工作,爬虫面试的一个面试题。涉及的反爬还是比较全面的,结果公司要求高,要解决视频链接时效性问题,凉凉。
直接上代码:
import requests
import time
from datetime import datetime
import json
import execjs
import hashlib
import re
import csv
from zlib import crc32
from base64 import b64decode
import random
import urllib3
import os
import threading
from queue import Queue
from lxml import etree
# Check which JS runtime execjs would use (e.g. Node.js)
# print(execjs.get().name)
# Suppress SSL-verification warnings (requests are presumably made with verify disabled — confirm at call sites)
urllib3.disable_warnings()
"""
需要nodejs环境,需要修改subprocess.py文件内的class Popen(object)类中的__init__(..encode='utf-8)否则调用js文件时会报错
请求列表页时.py文件中的ua头要与js文件中一致,不然很难请求到数据,请求详情页时要用ua池否则会封浏览器/ip
会有一些空白表格,是因为该账号七天内为发表内容,或者该账号被封禁
输出结果在此文件所在根目录下/toutiao/
右键运行此py文件,newsign.js文件,toutiao.csv文件需在同一文件夹内
爬取的视频有时效性
"""
# User-Agent pool for detail-page requests: a fresh random desktop UA per call
# so repeated requests do not all present the same browser fingerprint.
def headers():
    """Return a requests-style headers dict with one randomly chosen desktop User-Agent."""
    ua_pool = [
        # Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        # Firefox
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        # Safari
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        # 360 browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # Taobao browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        # Liebao (Cheetah) browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        # QQ browser
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        # Sogou browser
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        # Maxthon browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        # UC browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]
    return {'User-Agent': random.choice(ua_pool)}
# Fixed UA for the list-page requests.
# NOTE(review): per the script's own notes, this UA must match the one
# hard-coded in newsign.js, otherwise the signed list requests fail.
headers_a = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
}
# HTTP proxy. NOTE(review): hard-coded address is likely dead — replace with a
# live proxy before running.
proxy = {
'http': '183.57.44.62:808'
}
# Cookie jar sent with requests; s_v_web_id presumably identifies the browser
# session to Toutiao — TODO confirm it is still accepted.
cookies = {'s_v_web_id': 'b68312370162a4754efb0510a0f6d394'}
# Compute the _signature query parameter via the site's own JS.
def get_signature(user_id, max_behot_time):
    """Evaluate tac(user_id + max_behot_time) from newsign.js and return the result.

    Requires a Node.js runtime reachable by execjs; replicates the page's
    TAC.sign(userInfo.id + "" + i.param.max_behot_time) call.
    """
    with open('newsign.js', 'r', encoding='utf-8') as js_file:
        js_source = js_file.read()
    execjs.get()
    signature = execjs.compile(js_source).call('tac', '%s%s' % (user_id, max_behot_time))
    return signature
# Build the `as` and `cp` query parameters (port of Toutiao's home_4abea46.js).
def get_as_cp(now=None):
    """Return ``{'as': ..., 'cp': ...}`` derived from a unix timestamp.

    Parameters
    ----------
    now : int | None
        Unix timestamp in seconds. Defaults to the current time; exposed as an
        optional parameter (backward-compatible) so the result is deterministic
        and testable.

    Scheme: take the timestamp's uppercase hex string ``e`` (without ``0x``)
    and the uppercase MD5 hex digest of the decimal timestamp, then interleave
    characters of slices of the two. When ``e`` is not exactly 8 hex digits the
    site's hard-coded fallback pair is returned instead.
    """
    if now is None:
        now = round(time.time())
    e = hex(int(now)).upper()[2:]  # timestamp as uppercase hex, '0x' stripped
    md5_hex = hashlib.md5(str(int(now)).encode('utf-8')).hexdigest().upper()
    if len(e) != 8:
        # Fallback constants taken from the original JS for off-width timestamps.
        return {'as': '479BB4B7254C150', 'cp': '7E0AC8874BB0985'}
    head = md5_hex[:5]   # first 5 digest chars
    tail = md5_hex[-5:]  # last 5 digest chars
    # Interleave digest and hex-timestamp characters exactly as the JS does.
    s = ''.join(head[k] + e[k] for k in range(5))
    r = ''.join(e[k + 3] + tail[k] for k in range(5))
    return {
        'as': 'A1' + s + e[-3:],
        'cp': e[0:3] + r + 'E1',
    }
# Obtain as/cp/_signature in one call by evaluating juejin.js (deprecated —
# the script now uses get_as_cp/get_signature instead).
def get_js():
    """Return the JSON string produced by get_as_cp_signature() in juejin.js.

    Requires a Node.js runtime reachable by execjs and juejin.js next to
    this script.
    """
    # `with` guarantees the handle is closed (the previous version opened the
    # file and never closed it), and a single read() replaces the manual
    # readline accumulation loop.
    with open(r"juejin.js", 'r', encoding='UTF-8') as js_file:
        js_source = js_file.read()
    ctx = execjs.compile(js_source)
    return ctx.call('get_as_cp_signature')
# print(json.loads(get_js())['as'])
# Article data: crawl one author's article list and append rows to a CSV.
# Shared stop flag consumed across pagination calls — presumably appended to
# when the crawl should stop; TODO confirm against the missing loop body.
break_flag = []
# NOTE(review): this function is garbled by the page extraction — `while n 8:`
# below is not valid Python, and the code that requests first_url and parses
# article_time / article_title / article_source / article_content is missing
# from this copy. Recover the full body from the original post before running.
def wenzhang(url=None, max_behot_time=0, n=0, csv_name=0):
max_qingqiu = 50
headers1 = ['发表时间', '标题', '来源', '所有图片', '文章内容']
# Signed list-page URL: user_id comes from the profile URL's second-to-last
# path segment; as/cp/_signature are the anti-scraping query parameters.
first_url = 'https://www.toutiao.com/c/user/article/?page_type=1&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s' % (
url.split('/')[-2], max_behot_time, get_as_cp()['as'], get_as_cp()['cp'],
get_signature(url.split('/')[-2], max_behot_time))
# NOTE(review): truncated — original was presumably `while n < max_qingqiu:`
# with the request/parse logic in between; `article_*` names below are
# otherwise undefined.
while n 8:
return None
row = {'发表时间': article_time[0], '标题': article_title[0].strip('"'), '来源': article_source[0],
'文章内容': article_content.strip()}
# Append one row per article; header row is written elsewhere (writeheader is
# commented out here).
with open('/toutiao/' + str(csv_name) + '文章.csv', 'a', newline='')as f:
f_csv = csv.DictWriter(f, headers1)
# f_csv.writeheader()
f_csv.writerow(row)
print('正在爬取文章:', article_title[0].strip('"'), article_time[0], url)
# Throttle between articles to reduce the chance of a ban.
time.sleep(0.5)
return 'ok'
# Video data: crawl one author's video list (page_type=0) — CSV columns below.
# Shared stop flag for video pagination, parallel to break_flag above.
break_flag_video = []
# NOTE(review): this function is truncated by the page extraction — the body
# ends mid-statement at `while n` and everything after (request, parse, CSV
# write) is missing from this copy. Recover the full body from the original
# post before running.
def shipin(url, max_behot_time=0, csv_name=0, n=0):
max_qingqiu = 20
headers2 = ['视频发表时间', '标题', '来源', '视频链接']
# Same signed list-page URL as wenzhang, but page_type=0 selects videos.
first_url = 'https://www.toutiao.com/c/user/article/?page_type=0&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s' % (
url.split('/')[-2], max_behot_time, get_as_cp()['as'], get_as_cp()['cp'],
get_signature(url.split('/')[-2], max_behot_time))
while n
关注
打赏
热门博文
- Python骚操作,实现驾考自动答题,这就直接满分了?
- 用Python自动实现图表可视化操作,提高工作效率,又能有更多的时间摸鱼了~
- Python:用tkinter制做一个音乐下载小软件
- Python丨小学妹喜欢看漫画,于是我写了四十行代码获取了它所有漫画
- 女同桌找我要表情包,还好我会Python,分分钟给她下载几十个G...
- 为了防止这上面的文章被封,我连夜用Python获取了它所有内容,真香~
- 这个Python读取文件的方法,堪称天花板级别...
- Python做一个通过输入bv号就能下载视频的工具,评论和弹幕也不放过
- Python爬虫何如抓包?这三个案例手把手教会你,非常详细...
- Python:50行代码实现下载小说,图片章节可自动识别转文字保存...