您当前的位置: 首页 >  json

爬虫存为多级嵌套JSON文件总结

发布时间:2022-07-12 22:14:46 ,浏览量:5

爬虫存为多级嵌套JSON文件总结 1、(Array)数组
["v1","v2","v3",...] 
2、(object)对象
{"key1": "value1", "key2": "value2", ...} 
3、数组中包含对象
[ "v1", "v2", { "key": "value", "key1": "value1" } ] 
4、对象中包含数组
{ "key1": "value1", "key2": [ "v1", "v2" ] } 
5、常用知识点
# Demonstration script: convert between JSON strings and Python dicts,
# then save a pretty-printed JSON file that keeps non-ASCII text readable.
import requests
import json

'''
json.loads(json_str)  -- convert a JSON string into a dict
json.dumps(dict)      -- convert a dict into a JSON string
'''

# An AJAX GET request that returns a JSON document (Douban "now showing" movies).
r = requests.get("https://m.douban.com/rexxar/api/v2/subject_collection/movie_showing/items?os=ios&for_mobile=1&start=0&count=18&loc_id=108288&_=0")
json_response = r.content.decode()  # response body as a JSON string

# Parse the JSON string into a dict.
dict_json = json.loads(json_response)
print(type(dict_json))

# Serialize the dict back into a JSON string.
str_json = json.dumps(dict_json)
print(type(str_json))

# Write the dict to a local file as JSON.
# encoding='utf-8' is required here: ensure_ascii=False emits raw non-ASCII
# characters, and the original (no encoding argument) can raise
# UnicodeEncodeError under a non-UTF-8 default locale such as Windows GBK.
with open('./a.txt', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps Chinese text readable; indent=2 pretty-prints.
    f.write(json.dumps(dict_json, ensure_ascii=False, indent=2))
6、 爬虫例子 1、爬虫实际例子1
# -*- coding: utf-8 -*-
# Scrape service listings from xian.baixing.com/fuwu/ and save each listing's
# title, URL, and contact phone as a JSON array of objects.
import requests
from lxml import etree
import json

# Mimic a browser: the site inspects User-Agent and referer headers.
# https://xian.baixing.com/fuwu/
url = "https://xian.baixing.com/fuwu/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'referer': url,
}
response = requests.get(url, headers=headers)
body = response.text  # page HTML

html = etree.HTML(body, etree.HTMLParser())
gethtml = html.xpath('//div[contains(@class,"media-body-title")]')

# Build one {title, url, phone} record per listing block.
# NOTE(review): each xpath(...)[0] assumes the element exists in every block;
# a listing missing a phone button would raise IndexError — confirm against
# the live page before hardening.
jsondata = []
for item in gethtml:
    jsonone = {}
    jsonone['title'] = item.xpath('.//a[contains(@class,"ad-title")]/text()')[0]
    jsonone['url'] = item.xpath('.//a[contains(@class,"ad-title")]/attribute::href')[0]
    jsonone['phone'] = item.xpath('.//button[contains(@class,"contact-button")]/attribute::data-contact')[0]
    jsondata.append(jsonone)

# Save as JSON; ensure_ascii=False keeps the Chinese titles human-readable.
with open("./d.json", 'w', encoding='utf-8') as json_file:
    json.dump(jsondata, json_file, ensure_ascii=False)

在这里插入图片描述 在这里插入图片描述

2 、爬虫实际例子2
# -*- coding: utf-8 -*-
# Scrape a novel's chapter index and store it as nested JSON:
# [{"book": <title>, "content": [{"chapter": <name>, "href": <link>}, ...]}]
from lxml import etree  # HTML parser / XPath engine
import requests, json
import sys
from bs4 import BeautifulSoup
import importlib
from fake_useragent import UserAgent

user_agent = UserAgent().random  # random browser User-Agent string
url_01 = 'https://www.daomu1234.com/book/21939/'
headers = {'User-Agent': user_agent, 'referer': url_01}
response_01 = requests.get(url_01, headers=headers)
if response_01.status_code == 200:  # request succeeded
    # print(response_01.text)       # uncomment to inspect the raw HTML
    print('请求成功')
    html_01 = etree.HTML(response_01.text)  # parse the HTML string
    content = []
    title_0 = html_01.xpath('/html/body/div[1]/div[2]/div[1]/div[1]/div[2]/h1')
    for title_01 in title_0:
        title = title_01.xpath('./text()')[0]  # book title
        print(title)
        chapter = []
        # position()>6 skips the first six <li> entries — presumably site
        # boilerplate before the real chapter list; verify against the page.
        select = html_01.xpath('//*[@id="play_0"]/ul/li[position()>6]')
        for select_01 in select:
            i_0 = select_01.xpath('./a/text()')[0]  # chapter name
            i_1 = select_01.xpath('./a/@href')[0]   # chapter link
            chapter.append({'href': i_1, 'chapter': i_0})
        content.append({'book': title, 'content': chapter})
    # Persist the nested structure as JSON.
    # FIX: encoding='utf-8' added — ensure_ascii=False writes raw Chinese
    # characters, and the original open() without an encoding raises
    # UnicodeEncodeError under a non-UTF-8 default locale (e.g. Windows GBK).
    with open('daomubiji0.json', 'w', encoding='utf-8') as fp:
        json.dump(content, fp=fp, ensure_ascii=False, indent=4, sort_keys=True)

在这里插入图片描述 在这里插入图片描述

关注
打赏
1688896170
查看更多评论

暂无认证

  • 5浏览

    0关注

    115984博文

    0收益

  • 0浏览

    0点赞

    0打赏

    0留言

私信
关注
热门博文
立即登录/注册

微信扫码登录

0.3257s