爬虫存为多级嵌套JSON文件总结
1、(Array)数组
["v1","v2","v3",...]
2、(object)对象
{"key1": "value1", "key2": "value2", ...}
3、数组中包含对象
[
"v1",
"v2",
{
"key": "value",
"key1": "value1"
}
]
4、对象中包含数组
{
"key1": "value1",
"key2": [
"v1",
"v2"
]
}
5、常用知识点
import requests
import json

# json.loads(json_str)  -- parse a JSON string into a dict
# json.dumps(obj)       -- serialize a dict into a JSON string

# AJAX-style GET request that returns a JSON object (Douban "now showing" movies).
r = requests.get("https://m.douban.com/rexxar/api/v2/subject_collection/movie_showing/items?os=ios&for_mobile=1&start=0&count=18&loc_id=108288&_=0")
json_response = r.content.decode()  # response body decoded into a JSON string

# Parse the JSON string into a dict
dict_json = json.loads(json_response)
print(type(dict_json))

# Serialize the dict back into a JSON string
str_json = json.dumps(dict_json)
print(type(str_json))

# Persist the dict to a local file as JSON.
# ensure_ascii=False keeps non-ASCII characters readable; indent=2 pretty-prints.
# encoding='utf-8' is required so the raw non-ASCII output doesn't crash on
# platforms whose default locale encoding is not UTF-8.
with open('./a.txt', 'w', encoding='utf-8') as f:
    f.write(json.dumps(dict_json, ensure_ascii=False, indent=2))
6、 爬虫例子
1、爬虫实际例子1
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import json

# Scrape service listings (title, url, phone) from Baixing and save them as JSON.
# https://xian.baixing.com/fuwu/
url = "https://xian.baixing.com/fuwu/"
# Spoof a browser User-Agent and Referer so the site serves the normal page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 'referer': url}
# timeout prevents the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
body = response.text  # raw HTML of the listings page
html = etree.HTML(body, etree.HTMLParser())
gethtml = html.xpath('//div[contains(@class,"media-body-title")]')
# Build one {title, url, phone} record per listing block.
jsondata = []
for item in gethtml:
    jsonone = {}
    jsonone['title'] = item.xpath('.//a[contains(@class,"ad-title")]/text()')[0]
    jsonone['url'] = item.xpath('.//a[contains(@class,"ad-title")]/attribute::href')[0]
    jsonone['phone'] = item.xpath('.//button[contains(@class,"contact-button")]/attribute::data-contact')[0]
    jsondata.append(jsonone)
# Save the records as JSON; ensure_ascii=False keeps Chinese text readable in the file.
with open("./d.json", 'w', encoding='utf-8') as json_file:
    json.dump(jsondata, json_file, ensure_ascii=False)
# -*- coding: utf-8 -*-
from lxml import etree  # HTML parsing
import requests, json
import sys
from bs4 import BeautifulSoup
import importlib
from fake_useragent import UserAgent

# Scrape a novel's title and chapter index (name + href per chapter) and
# save the result as a nested JSON file.
user_agent = UserAgent().random  # random browser User-Agent string
url_01 = 'https://www.daomu1234.com/book/21939/'
headers = {'User-Agent': user_agent,
           'referer': url_01
           }
# timeout prevents the request from hanging indefinitely on a dead connection.
response_01 = requests.get(url_01, headers=headers, timeout=10)
if response_01.status_code == 200:  # request succeeded
    print('请求成功')
    html_01 = etree.HTML(response_01.text)  # parse the HTML string
    content = []
    title_0 = html_01.xpath('/html/body/div[1]/div[2]/div[1]/div[1]/div[2]/h1')
    for title_01 in title_0:
        title = title_01.xpath('./text()')[0]
        print(title)
        chapter = []
        # position()>6 skips the first 6 <li> items — presumably non-chapter
        # entries at the top of the list; verify against the live page.
        select = html_01.xpath('//*[@id="play_0"]/ul/li[position()>6]')
        for select_01 in select:
            i_0 = select_01.xpath('./a/text()')[0]  # chapter name
            i_1 = select_01.xpath('./a/@href')[0]   # chapter link
            chapter.append({'href': i_1, 'chapter': i_0})
        content.append({'book': title, 'content': chapter})
    # Store the collected data as a JSON file.
    # encoding='utf-8' is required: ensure_ascii=False writes raw Chinese
    # characters, which would raise UnicodeEncodeError under a non-UTF-8
    # default locale (e.g. Windows cp936). Also matches the sibling script above.
    with open('daomubiji0.json', 'w', encoding='utf-8') as fp:
        json.dump(content, fp=fp, ensure_ascii=False, indent=4, sort_keys=True)