废话少说,直接上代码。
这是成果:
from selenium import webdriver # webdriver 的驱动程序
from selenium.webdriver.common.by import By # 提取数据
from selenium.webdriver import ChromeOptions # chromeOptions 是一个配置 chrome 启动是属性的类。通过这个类,我们可以为chrome配置参数
from selenium.webdriver.support.wait import WebDriverWait # 显示等待
from selenium.webdriver.support import expected_conditions as EC
import time # 导入time
import csv # 导入csv
import os.path # 创建文件夹
import requests # 请求网页数据
import re # 正则匹配 文字匹配
# Shared Chrome WebDriver instance; created as a module-level side effect and
# referenced by the scraping functions in this file.
driver = webdriver.Chrome()
WAIT = WebDriverWait(driver, 10) # explicit wait bound to the shared driver, 10-second timeout
def get_serch(url):
    """Open *url* in the shared browser and click the '分类' (category) link.

    Args:
        url: Address of the page to load.

    Returns:
        The module-level ``WAIT`` object on success, or ``None`` when loading
        the page or locating/clicking the link fails.
    """
    try:
        driver.get(url=url)
        time.sleep(1)
        WAIT.until(EC.presence_of_element_located((By.LINK_TEXT, '分类'))).click()
        time.sleep(1)
    except Exception as exc:
        # ``Exception`` instead of a bare ``except`` so that Ctrl+C and
        # SystemExit still propagate; the cause is printed for diagnosis.
        print("erro", exc)
        return None
    return WAIT
def _scrape_detail(WAIT):
    """Read one movie's fields from the detail page currently in focus."""

    def located(xpath):
        # Block until the element is present in the DOM, then return it.
        return WAIT.until(EC.presence_of_element_located((By.XPATH, xpath)))

    title = located('//*[@id="content"]/h1/span[1]').text
    images = located('//*[@class="subject clearfix"]/div[1]/a/img').get_attribute('src')
    ditail = located('//div[@class="related-info"]/div/span').text
    # textContent is read instead of .text for these two fields, as in the original.
    aoter = located('//*[@class="subject clearfix"]/div[2]/span[1]').get_attribute('textContent')
    score = located('//*[@class="subjectwrap clearfix"]/div[2]/div[1]/div[2]/strong').text
    evaluate = located(
        '//*[@class="subjectwrap clearfix"]/div[2]/div[1]/div[2]/div/div/a/span'
    ).get_attribute('textContent')
    return {
        '标题': title,
        '图片': images,
        '简介': ditail,
        '作者': aoter,
        '评分': score,
        '评价': evaluate,
    }


def get_data(WAIT, n=15):
    """Expand the listing, then visit every movie link and collect its details.

    NOTE(review): the original source had no indentation; this assumes all
    "load more" clicks happen first and the links are collected afterwards.

    Args:
        WAIT: ``WebDriverWait`` bound to the shared ``driver``.
        n: How many times to click the '加载更多' (load more) link.

    Returns:
        A list of dicts with the keys 标题, 图片, 简介, 作者, 评分, 评价.
    """
    from selenium.common.exceptions import TimeoutException

    all_list = []
    for page in range(n):
        try:
            WAIT.until(EC.presence_of_element_located((By.LINK_TEXT, '加载更多'))).click()
        except TimeoutException:
            # The link did not show up within the wait timeout; stop expanding
            # and scrape whatever is already listed.
            print('没有找到"加载更多",停止加载')
            break
        print(f'==============================正在点击第{page}页的数据内容==============================')
        time.sleep(5)

    # Every anchor in the listing container is one movie entry.
    all_data = WAIT.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="app"]/div/div[1]/div[3]/a')))
    for i in all_data:
        i.click()
        # Move focus to the most recently opened window (the detail page).
        driver.switch_to.window(driver.window_handles[-1])
        try:
            item = _scrape_detail(WAIT)
            all_list.append(item)
            print(item)
            save_Images(item['图片'], item['标题'])
        except TimeoutException as exc:
            # One page missing an expected element must not discard the
            # results gathered so far; skip it and carry on.
            print('跳过该条数据===>', exc)
        finally:
            # Always close the detail window and return to the listing so the
            # next click happens in the right window.
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
    return all_list
def save_csv(all_list):
    """Append the scraped rows to ``豆瓣电影.csv`` in the working directory.

    The header row is written only when the file does not exist yet or is
    empty, so repeated runs do not insert header lines between data rows.

    Args:
        all_list: Iterable of dicts keyed by the column names in ``headers``.
    """
    headers = ['标题', '图片', '简介', '作者', '评分', '评价']
    path = '豆瓣电影.csv'
    # Decide before opening: append mode creates the file if it is missing.
    need_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', encoding='utf-8', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        if need_header:
            f_csv.writeheader()
        f_csv.writerows(all_list)
def save_Images(images, title):
    """Download the image at *images* and store it as ``豆瓣图片/<title>.jpg``.

    Characters that are not allowed in Windows file names are removed from
    *title* before it is used as the file name. A failed download is reported
    and skipped rather than written to disk.

    Args:
        images: URL of the image to download.
        title: Movie title used to build the file name.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('豆瓣图片', exist_ok=True)
    try:
        # A timeout keeps one stalled download from hanging the whole run.
        response = requests.get(url=images, timeout=10)
        # Without this an HTTP error body would be saved as a .jpg file.
        response.raise_for_status()
    except requests.RequestException as exc:
        print("保存图片失败===>", title, exc)
        return
    # Raw string: strip every character Windows forbids in file names.
    safe_name = re.sub(r'[\\/:*?"<>|]', "", title)
    # os.path.join picks the right separator on every platform.
    with open(os.path.join('豆瓣图片', safe_name + '.jpg'), mode='wb') as f:
        f.write(response.content)
    print("保存图片===>", title)
def main():
    """Run the scraper: open the site, collect movie details, write the CSV.

    The shared browser is always shut down at the end, including when a step
    raises.
    """
    url = 'https://movie.douban.com/'
    try:
        WAIT = get_serch(url)
        if WAIT is None:
            # get_serch already reported the failure; nothing to scrape.
            return
        all_list = get_data(WAIT)
        save_csv(all_list)
    finally:
        # Quit the browser so no Chrome/chromedriver process is left running.
        driver.quit()


if __name__ == '__main__':
    main()