Preface
Hey! Hi everyone, this is 魔王 here~
Amazon is the largest e-commerce company in the United States, headquartered in Seattle, Washington. Founded in 1994, it was one of the earliest companies to do business on the web. Today I'll show you how to batch-collect product data from the Amazon platform with Python. Address: https://www.amazon.cn/
Today's goal: batch-collect Amazon product data with Python
Topics covered:
- the basic crawler workflow
- parsing unstructured data
- saving data to CSV
- using a thread pool

Environment and modules:
- Python 3.8
- PyCharm
- requests >>> pip install requests
- parsel >>> pip install parsel
A crawler (in Python):
A tool for batch-collecting data from the web (video, text, images, audio).
Its essence:
It imitates a client (the browser) sending network requests.
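For instance, here is a minimal sketch of that idea (https://www.example.com/ is just a placeholder URL, not the target of this tutorial):

import requests

# Send the same User-Agent a real Chrome browser would,
# so the server treats us like an ordinary visitor
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
}
response = requests.get('https://www.example.com/', headers=headers)
print(response.status_code)  # 200 means the server answered normally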
Basic crawler approach. Site analysis: find the source of the data: https://www.amazon.cn/s?rh=n%3A106200071&fs=true&ref=lp_106200071_sar
Implementation steps:
- send the request (with requests, a third-party module for sending requests)
- get the data
- parse the data (with parsel, a third-party module that helps parse the data)
- save the data
- crawl multiple pages
- wrap the code into functions and crawl with a thread pool
import requests  # third-party module, sends the requests
import parsel    # third-party module, parses the data
import csv       # built-in module, saves CSV files
Add a disguise: request headers copied from your own browser's developer tools (the Cookie and session values below will expire, so replace them with your own):
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'session-id=460-4132650-8765807; i18n-prefs=CNY; ubid-acbcn=457-7935785-7667244; session-token=Laa3G6hMbBpdAIPYwutQqKxkSISU8fb2jTr0JiczqkeVISvqn2eqjw4N0BAbYfmy8+/S1B3kLvDb9ImsBnbwQHU6JG8EToefDoi69keaL1F6ExYDXCSqFF0hC4fkGAFJlNYYNqfVlvj5ewTVJP1pYgL4JG2tjM5O2Uk7ufiL9s7gvidAMaUj1QtBW5puqmoG; csm-hit=adb:adblk_no&t:1645531896484&tb:s-VMQ97YXPSC1MBACTN14J|1645531895768; session-id-time=2082729601l',
    'downlink': '10',
    'ect': '4g',
    'Host': 'www.amazon.cn',
    'Referer': 'https://www.amazon.cn/b/ref=s9_acss_bw_cg_pccateg_2a1_w?node=106200071&pf_rd_m=A1U5RCOVU0NYF2&pf_rd_s=merchandised-search-2&pf_rd_r=KE929JDVF8QRWWDQCWC0&pf_rd_t=101&pf_rd_p=cdcd9a0d-d7cf-4dab-80db-2b7d63266973&pf_rd_i=42689071',
    'rtt': '150',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
}
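A side tip of my own (not part of the original walkthrough): rather than quoting every header by hand, you can paste the raw "Key: Value" lines copied from DevTools into a string and convert them in one line. The raw_headers sample below is shortened and made up:

raw_headers = '''Host: www.amazon.cn
Connection: keep-alive
Upgrade-Insecure-Requests: 1'''
# Split each "Key: Value" line on the first ": " and build a dict from the pairs
demo_headers = dict(line.split(': ', 1) for line in raw_headers.splitlines())
print(demo_headers['Host'])  # www.amazon.cn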
for page in range(1, 401):
    url = f'https://www.amazon.cn/s?i=computers&rh=n%3A106200071&fs=true&page={page}&qid=1645537294&ref=sr_pg_3'
    # 1. Send the request
    response = requests.get(url=url, headers=headers)
    # 2. Get the data: the HTML text of the response
    data_html = response.text
    # print(data_html)
    # 3. Parse the data (HTML + CSS, what the site is built from)
    selector = parsel.Selector(data_html)
    divs = selector.css('.a-section.a-spacing-base')
    for div in divs:
        # ::text extracts the text content of a tag
        title = div.css('.a-size-base-plus.a-color-base.a-text-normal::text').get()
        price = div.css('.a-size-base.a-link-normal.s-underline-text.s-underline-link-text.s-link-style.a-text-normal .a-price .a-offscreen::text').get()
        img_url = div.css('.a-section.aok-relative.s-image-square-aspect .s-image::attr(src)').get()
        link = div.css('.a-link-normal.s-no-outline::attr(href)').get()
        print(title, price, img_url, link)
        # 4. Save the data
        with open('亚马逊.csv', mode='a', encoding='utf-8', newline='') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow([title, price, img_url, link])
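A quick aside on the ::text and ::attr() syntax used above: these are parsel's pseudo-elements for pulling out a tag's text and a tag's attribute. A tiny self-contained sketch (the HTML snippet is made up for illustration):

import parsel

html = '<div class="item"><a class="link" href="/p/1">Mechanical keyboard</a></div>'
selector = parsel.Selector(html)
print(selector.css('.link::text').get())        # Mechanical keyboard
print(selector.css('.link::attr(href)').get())  # /p/1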
Multithreading
Import the modules:
import requests            # third-party module, sends the requests
import parsel              # third-party module, parses the data
import csv                 # built-in module, saves CSV files
import concurrent.futures  # built-in module, provides the thread pool
Add the disguise again (the same headers as above):
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'session-id=460-4132650-8765807; i18n-prefs=CNY; ubid-acbcn=457-7935785-7667244; session-token=Laa3G6hMbBpdAIPYwutQqKxkSISU8fb2jTr0JiczqkeVISvqn2eqjw4N0BAbYfmy8+/S1B3kLvDb9ImsBnbwQHU6JG8EToefDoi69keaL1F6ExYDXCSqFF0hC4fkGAFJlNYYNqfVlvj5ewTVJP1pYgL4JG2tjM5O2Uk7ufiL9s7gvidAMaUj1QtBW5puqmoG; csm-hit=adb:adblk_no&t:1645531896484&tb:s-VMQ97YXPSC1MBACTN14J|1645531895768; session-id-time=2082729601l',
    'downlink': '10',
    'ect': '4g',
    'Host': 'www.amazon.cn',
    'Referer': 'https://www.amazon.cn/b/ref=s9_acss_bw_cg_pccateg_2a1_w?node=106200071&pf_rd_m=A1U5RCOVU0NYF2&pf_rd_s=merchandised-search-2&pf_rd_r=KE929JDVF8QRWWDQCWC0&pf_rd_t=101&pf_rd_p=cdcd9a0d-d7cf-4dab-80db-2b7d63266973&pf_rd_i=42689071',
    'rtt': '150',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
}
def get_html(url):
    # 1. Send the request
    response = requests.get(url=url, headers=headers)
    # 2. Get the data: the HTML text of the response
    data_html = response.text
    return data_html
def parse_data(data_html):
    # 3. Parse the data (HTML + CSS, what the site is built from)
    selector = parsel.Selector(data_html)
    divs = selector.css('.a-section.a-spacing-base')
    info_list = []
    for div in divs:
        # ::text extracts the text content of a tag
        title = div.css('.a-size-base-plus.a-color-base.a-text-normal::text').get()
        price = div.css('.a-size-base.a-link-normal.s-underline-text.s-underline-link-text.s-link-style.a-text-normal .a-price .a-offscreen::text').get()
        img_url = div.css('.a-section.aok-relative.s-image-square-aspect .s-image::attr(src)').get()
        link = div.css('.a-link-normal.s-no-outline::attr(href)').get()
        info_list.append([title, price, img_url, link])
    return info_list
def save_data(info_list):
    # 4. Save the data
    print(info_list)
    with open('亚马逊.csv', mode='a', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(info_list)
def run(url):
    # 1. Send the request and get the data
    data_html = get_html(url)
    # 2. Parse the data
    info_list = parse_data(data_html)
    # 3. Save the data
    save_data(info_list)
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as exe:
    for page in range(1, 401):
        url = f'https://www.amazon.cn/s?i=computers&rh=n%3A106200071&fs=true&page={page}&qid=1645537294&ref=sr_pg_3'
        exe.submit(run, url)

# Single-threaded version, kept for comparison:
# for page in range(1, 401):
#     url = f'https://www.amazon.cn/s?i=computers&rh=n%3A106200071&fs=true&page={page}&qid=1645537294&ref=sr_pg_3'
#     run(url)
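One caveat about the thread-pool version: several worker threads can call save_data() at the same time, and concurrent appends to 亚马逊.csv may interleave rows. A minimal sketch of one fix, guarding the write with a lock (the write_lock name is my own addition, not from the code above):

import threading

write_lock = threading.Lock()  # one lock shared by all worker threads

def save_data(info_list):
    # Only one thread at a time may append to the CSV file
    with write_lock:
        with open('亚马逊.csv', mode='a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerows(info_list)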
Closing words
Alright, this article of mine ends right here!
If you have more suggestions or questions, drop them in the comments or message me! Let's keep at it together (ง •_•)ง
If you liked it, follow me, or give this article a like, bookmark, or comment!!!