python+requests+BeautifulSoup实现保存二手车信息至excel

王同学在这发布时间：2022-03-17 15:50:15 ，浏览量：4

文章目录

- 前言:
- 网页分析
- 主要思路
- 实现步骤
- 打印结果
- 实现
- 实现全部代码：

提示：以下是本篇文章正文内容，下面案例可供参考

前言:

零基础入门Python的先从python基础学起，打好基础再慢慢接触后面的内容，给自己找任务，多做做练习，也建议大家学Python时一定要多写多看。

网页分析

节约时间，不废话介绍了，直接上例子！！！我们就使用 requests 和 BeautifulSoup来写一个,每个人都有自己喜欢的车吧！（包括我也是）所以说这章我们就来实现把车辆信息存储下来到本地，浏览车价格、情况等，等哪天暴富再买下来。在这里插入图片描述 ok，下面就是开始学习python的正确姿势，请在电脑陪同下进行操作。

首先我们要打开网址，进行网页分析，了解网页之后才能确定用哪种方法来实现。进入网址之后我们点击买车，就能看到车所有的信息了。在这里插入图片描述

还是老样子确定一下网页是动态的还是静态的先，在网页源码中能输入关键字查找到，说明这是静态的，那么我们接下来就可以用常规的方法对网址进行实现。在这里插入图片描述

每一页显示 48 辆车。当我们点击下一页的时候，可以发现地址变了：page=2#pagetag、page=3#pagetag、page=4#pagetag。

通过观察可以发现，page=2 那里的数字代表页码，到后面我们实现翻页的时候可以直接用一个循环变量进行翻页，来获取不同页面的内容。在这里插入图片描述 明确我们需要的信息是：车名

价格

封面图片

里程

发动机

排量

上牌时间

通过页面源码我们可以了解到这些信息都放在ul标签的li标签中。在这里插入图片描述

那么接下来我们就可以发起请求，用BeautifulSoup来提取我们所需要的信息，分析完成之后我们就开始操作吧。在这里插入图片描述

主要思路

1 用 requests 发起请求 2 写个循环变量来实现翻页 3 然后将返回的 HTML 用 BeautifulSoup 进行解析 4 解析完之后就把内容存到文件中

实现步骤

导入相应的库之后，构造伪造的请求头来防止简单的反爬，接下来对网址发起请求，返回 text。

import requests
from bs4 import BeautifulSoup
import xlwt
import os
import re

# 
def get_content(url):
    """Download a page and return its decoded HTML text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as ``str`` when the server answers with HTTP 200;
        ``None`` for any other status code or when ``requests`` raises.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    try:
        # ``headers`` has to be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, so a positional dict
        # ends up in the query string and the User-Agent is never sent.
        # The timeout keeps a stalled server from hanging the crawl; a
        # timeout is a RequestException and is reported below.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print('error',e)
        return None
    if response.status_code != 200:
        return None
    # Use the charset detected from the body instead of the declared one.
    response.encoding = response.apparent_encoding
    return response.text

拿到网页源码之后我们就可以用BeautifulSoup解析内容，找到所有的车标签li，循环每一个li，用BeautifulSoup按标签和属性查找的方法找到我们所需的内容，拿到详情页链接，再进详情页提取内容。

def get_data(response):
    """Parse a listing page and read the summary fields of every car entry.

    ``response`` is handed straight to BeautifulSoup, so it is expected to be
    the HTML text of a listing page.
    """
    # Build the parse tree with the lxml parser
    soup = BeautifulSoup(response,'lxml')
    # All <li> elements inside the <ul class="gongge_ul"> listing
    all_data = soup.find('ul',class_="gongge_ul").find_all('li')
    for i in all_data:
        # Text of the first <span> inside the "gongge_main" block
        title = i.find('div',class_="gongge_main").find('span').text
        # Value of the image's data-src attribute
        images = i.find('div',class_="item_img").find('img').get('data-src')
        # Text of the second <i> element inside the "gongge_main" block
        gl = i.find('div',class_="gongge_main").find_all('i')[1].text
        # Link to the detail page (href of the first <a> in the entry)
        ditail = i.find('a').get('href')

这是车的详情页。在这里插入图片描述 有了详情页链接，我们再对详情页发起请求，进入详情页提取内容，同样是根据网页源码的标签和属性对内容进行提取。在详情页提取完我们需要的信息后，再调用其他函数并把这些信息作为参数传入。

# Detail page (continuation of the loop body in get_data)
        # Download the detail page and parse it with the lxml parser
        ditail_data = requests.get(url=ditail).text
        soup_li = BeautifulSoup(ditail_data,'lxml')
        # One record is produced per element carrying class "detail-wrapper"
        for xt in soup_li.find_all(class_="detail-wrapper"):
            # Text of the "price-this" element
            momey = xt.find(class_="price-this").text
            # Third <li> of the parameter list, with newlines and spaces removed
            ml = xt.find(class_="col-xs-6 parameter-configure-list").find_all('li')[2].text.replace('\n','').replace(' ','')
            # Text of the third <dl> inside "summary-attrs"
            pl = xt.find(class_="summary-attrs").find_all('dl')[2].text
            # First <dl> inside "summary-attrs" with its '上牌时间' label stripped
            time = xt.find('div',class_="summary-attrs").find('dl').text.replace('上牌时间','')
            # Whole parameter block as one string without newlines or spaces
            content = xt.find('div',class_="row parameter-configure").text.replace('\n','').replace(' ','')
            # Record used only for the console printout below
            item = {
                '标题':title,
                '图片':images,
                '里程':gl,
                '价格':momey,
                '情况':ml,
                '万里':pl,
                '时间':time,
                '详情':content
            }
            print(item)
            # Persist the row to the spreadsheet, then download the cover image
            save_CSV(title,images,gl,momey,ml,pl,time,content)
            save_Images(title,images)

信息提取完成之后就是把数据保存到Excel表

def save_CSV(title,images,gl,momey,ml,pl,time,content):
    """Write one car record into the next free worksheet row and save the file.

    The eight arguments are written to columns 0-7 of row ``n`` in the order
    they are received. The function relies on the module-level ``sheet``,
    ``book`` and row counter ``n``; ``n`` is advanced by one on every call.

    Returns:
        None.
    """
    global n
    record = (title, images, gl, momey, ml, pl, time, content)
    for column, value in enumerate(record):
        sheet.write(n, column, value)
    n += 1
    # xlwt produces the legacy binary .xls format only; storing that data
    # under an .xlsx name yields a file Excel refuses to open, so the file
    # is saved with the matching .xls extension.
    book.save(u'汽车.xls')
    print('正在保存===>:',title)

最后就是对封面保存到本地了。

def save_Images(title,images):
    """Download a car's cover image into the ``tche`` directory.

    Args:
        title: Car title; used as the file name after characters that are
            not allowed in file names have been removed.
        images: URL of the cover image. When it is empty or ``None`` the
            car is skipped without touching the network or the disk.

    Returns:
        None.
    """
    if not images:
        # Entries without an image URL cannot be downloaded.
        print('error', 'no image url for', title)
        return
    os.makedirs('tche', exist_ok=True)
    try:
        images_data = requests.get(url=images, timeout=10).content
    except requests.RequestException as e:
        # A single failed image must not abort the whole crawl.
        print('error', e)
        return
    # Raw string avoids the invalid "\|" escape; the class also strips
    # backslash, "<" and ">", which are equally illegal in Windows names.
    li = re.sub(r'[\\/:*?"<>|]', "", title)
    # os.path.join keeps the file inside the directory on every platform
    # (a hard-coded backslash only acts as a separator on Windows).
    with open(os.path.join('tche', li + '.jpg'), mode='wb') as f:
        f.write(images_data)
    print('正在保存图片=====>:',title)

打印结果

这里实现个循环变量来翻页。（需要更多页的可更改数字）

在这里插入图片描述

到这里我们就把车的信息保存至Excel了。

实现

在这里插入图片描述

实现全部代码：

import requests
from bs4 import BeautifulSoup
import xlwt
import os
import re

# 发送请求
def get_content(url):
    """Download a page and return its decoded HTML text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as ``str`` when the server answers with HTTP 200;
        ``None`` for any other status code or when ``requests`` raises.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    try:
        # ``headers`` has to be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, so a positional dict
        # ends up in the query string and the User-Agent is never sent.
        # The timeout keeps a stalled server from hanging the crawl; a
        # timeout is a RequestException and is reported below.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print('error',e)
        return None
    if response.status_code != 200:
        return None
    # Use the charset detected from the body instead of the declared one.
    response.encoding = response.apparent_encoding
    return response.text




# Create the xlwt workbook and the worksheet that receives the scraped rows.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('汽车', cell_overwrite_ok=True)

# Header row (row 0): one label per column, left to right.
_HEADERS = ('名称', '图片', '万里', '价格', '马力', '排量', '上牌时间', '详情信息')
for _column, _label in enumerate(_HEADERS):
    sheet.write(0, _column, _label)

# Index of the next free data row; save_CSV() increments it after each write.
n = 1

def get_data(response):
    """Parse a listing page, scrape each car's detail page and store the result.

    For every ``<li>`` in the ``ul.gongge_ul`` listing the title, cover image
    URL and mileage text are read and the linked detail page is downloaded.
    Each ``.detail-wrapper`` element found there yields one record, which is
    printed and passed to ``save_CSV`` and ``save_Images``.

    Entries or detail pages that lack the expected markup, and detail pages
    that cannot be downloaded, are reported and skipped instead of aborting
    the whole crawl.

    Args:
        response: HTML text of a listing page, or ``None``/empty when the
            download failed.

    Returns:
        None.
    """
    if not response:
        # get_content() returns None on failure; there is nothing to parse.
        print('error', 'empty listing page, skipped')
        return

    soup = BeautifulSoup(response, 'lxml')
    listing = soup.find('ul', class_="gongge_ul")
    if listing is None:
        print('error', 'listing <ul class="gongge_ul"> not found')
        return

    for entry in listing.find_all('li'):
        try:
            main_box = entry.find('div', class_="gongge_main")
            title = main_box.find('span').text
            images = entry.find('div', class_="item_img").find('img').get('data-src')
            gl = main_box.find_all('i')[1].text
            ditail = entry.find('a').get('href')
        except (AttributeError, IndexError):
            # This <li> does not have the car-entry structure; skip it.
            continue
        if not ditail:
            continue

        # Fetch the detail page; a network failure only skips this car.
        try:
            ditail_data = requests.get(url=ditail, timeout=10).text
        except requests.RequestException as e:
            print('error', e)
            continue

        soup_li = BeautifulSoup(ditail_data, 'lxml')
        for xt in soup_li.find_all(class_="detail-wrapper"):
            try:
                momey = xt.find(class_="price-this").text
                ml = xt.find(class_="col-xs-6 parameter-configure-list").find_all('li')[2].text.replace('\n', '').replace(' ', '')
                attrs = xt.find('div', class_="summary-attrs")
                pl = attrs.find_all('dl')[2].text
                # Local is named reg_time so it cannot shadow the stdlib
                # ``time`` module name.
                reg_time = attrs.find('dl').text.replace('上牌时间', '')
                content = xt.find('div', class_="row parameter-configure").text.replace('\n', '').replace(' ', '')
            except (AttributeError, IndexError) as e:
                print('error', e)
                continue

            item = {
                '标题': title,
                '图片': images,
                '里程': gl,
                '价格': momey,
                '情况': ml,
                '万里': pl,
                '时间': reg_time,
                '信息': content,
            }
            print(item)
            save_CSV(title, images, gl, momey, ml, pl, reg_time, content)
            save_Images(title, images)



def save_CSV(title,images,gl,momey,ml,pl,time,content):
    """Write one car record into the next free worksheet row and save the file.

    The eight arguments are written to columns 0-7 of row ``n`` in the order
    they are received. The function relies on the module-level ``sheet``,
    ``book`` and row counter ``n``; ``n`` is advanced by one on every call.

    Returns:
        None.
    """
    global n
    record = (title, images, gl, momey, ml, pl, time, content)
    for column, value in enumerate(record):
        sheet.write(n, column, value)
    n += 1
    # xlwt produces the legacy binary .xls format only; storing that data
    # under an .xlsx name yields a file Excel refuses to open, so the file
    # is saved with the matching .xls extension.
    book.save(u'二手车.xls')
    print('正在保存===>:',title)



def save_Images(title,images):
    """Download a car's cover image into the ``che`` directory.

    Args:
        title: Car title; used as the file name after characters that are
            not allowed in file names have been removed.
        images: URL of the cover image. When it is empty or ``None`` the
            car is skipped without touching the network or the disk.

    Returns:
        None.
    """
    if not images:
        # Entries without an image URL cannot be downloaded.
        print('error', 'no image url for', title)
        return
    os.makedirs('che', exist_ok=True)
    try:
        images_data = requests.get(url=images, timeout=10).content
    except requests.RequestException as e:
        # A single failed image must not abort the whole crawl.
        print('error', e)
        return
    # Raw string avoids the invalid "\|" escape; the class also strips
    # backslash, "<" and ">", which are equally illegal in Windows names.
    li = re.sub(r'[\\/:*?"<>|]', "", title)
    # os.path.join keeps the file inside the directory on every platform
    # (a hard-coded backslash only acts as a separator on Windows).
    with open(os.path.join('che', li + '.jpg'), mode='wb') as f:
        f.write(images_data)
    print('正在保存图片=====>:',title)



def main(page):
    """Scrape one listing page and store every car found on it.

    Args:
        page: Page number being processed; shown in the progress message.

    Returns:
        None.
    """
    # NOTE(review): the address below is a redacted placeholder and does not
    # embed ``page``, so every call requests the same URL. The real listing
    # URL (the article shows it carrying ``page=<n>#pagetag``) has to be
    # filled in here with ``page`` interpolated.
    url = '。。。。。。。。。'
    print(f'==============================正在保存第{page}页的数据内容==============================')
    response = get_content(url)
    if response is None:
        # get_content() returns None on a failed download; parsing None
        # would raise, so skip this page instead.
        print('error', f'page {page} could not be downloaded, skipped')
        return
    get_data(response)


if __name__ == '__main__':
    # Crawl listing pages 1 through 5; raise LAST_PAGE to fetch more pages.
    FIRST_PAGE, LAST_PAGE = 1, 5
    for page_number in range(FIRST_PAGE, LAST_PAGE + 1):
        main(page=page_number)

在这里插入图片描述

关注

打赏

1649064435

查看更多评论

python+requests+BeautifulSoup实现保存二手车信息至excel

最近更新

热门博客

[ 申请 ]友情链接：