爬虫存为多级嵌套JSON文件总结
1、(Array)数组
["v1","v2","v3",...]
2、(object)对象
{"key1": "value1", "key2": "value2", ...}
3、数组中包含对象
[
"v1",
"v2",
{
"key": "value",
"key1": "value1"
}
]
4、对象中包含数组
{
"key1": "value1",
"key2": [
"v1",
"v2"
]
}
5、常用知识点
import requests
import json

# json.loads(json_str)  -- parse a JSON string into a dict
# json.dumps(obj)       -- serialize a dict into a JSON string

# AJAX-style GET request that returns a JSON object (Douban "now showing" movies).
r = requests.get("https://m.douban.com/rexxar/api/v2/subject_collection/movie_showing/items?os=ios&for_mobile=1&start=0&count=18&loc_id=108288&_=0")
json_response = r.content.decode()  # response body decoded into a JSON string

# Parse the JSON string into a dict
dict_json = json.loads(json_response)
print(type(dict_json))

# Serialize the dict back into a JSON string
str_json = json.dumps(dict_json)
print(type(str_json))

# Persist the dict to a local file as JSON.
# ensure_ascii=False keeps non-ASCII characters readable; indent=2 pretty-prints.
# encoding='utf-8' is required so the raw non-ASCII output doesn't crash on
# platforms whose default locale encoding is not UTF-8.
with open('./a.txt', 'w', encoding='utf-8') as f:
    f.write(json.dumps(dict_json, ensure_ascii=False, indent=2))
6、 爬虫例子
1、爬虫实际例子1
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import json

# Scrape service listings (title, url, phone) from Baixing and save them as JSON.
# https://xian.baixing.com/fuwu/
url = "https://xian.baixing.com/fuwu/"
# Spoof a browser User-Agent and Referer so the site serves the normal page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 'referer': url}
# timeout prevents the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
body = response.text  # raw HTML of the listings page
html = etree.HTML(body, etree.HTMLParser())
gethtml = html.xpath('//div[contains(@class,"media-body-title")]')
# Build one {title, url, phone} record per listing block.
jsondata = []
for item in gethtml:
    jsonone = {}
    jsonone['title'] = item.xpath('.//a[contains(@class,"ad-title")]/text()')[0]
    jsonone['url'] = item.xpath('.//a[contains(@class,"ad-title")]/attribute::href')[0]
    jsonone['phone'] = item.xpath('.//button[contains(@class,"contact-button")]/attribute::data-contact')[0]
    jsondata.append(jsonone)
# Save the records as JSON; ensure_ascii=False keeps Chinese text readable in the file.
with open("./d.json", 'w', encoding='utf-8') as json_file:
    json.dump(jsondata, json_file, ensure_ascii=False)
# -*- coding: utf-8 -*-
from lxml import etree  # HTML parsing
import requests, json
import sys
from bs4 import BeautifulSoup
import importlib
from fake_useragent import UserAgent

# Scrape a novel's title and chapter index (name + href per chapter) and
# save the result as a nested JSON file.
user_agent = UserAgent().random  # random browser User-Agent string
url_01 = 'https://www.daomu1234.com/book/21939/'
headers = {'User-Agent': user_agent,
           'referer': url_01
           }
# timeout prevents the request from hanging indefinitely on a dead connection.
response_01 = requests.get(url_01, headers=headers, timeout=10)
if response_01.status_code == 200:  # request succeeded
    print('请求成功')
    html_01 = etree.HTML(response_01.text)  # parse the HTML string
    content = []
    title_0 = html_01.xpath('/html/body/div[1]/div[2]/div[1]/div[1]/div[2]/h1')
    for title_01 in title_0:
        title = title_01.xpath('./text()')[0]
        print(title)
        chapter = []
        # position()>6 skips the first 6 <li> items — presumably non-chapter
        # entries at the top of the list; verify against the live page.
        select = html_01.xpath('//*[@id="play_0"]/ul/li[position()>6]')
        for select_01 in select:
            i_0 = select_01.xpath('./a/text()')[0]  # chapter name
            i_1 = select_01.xpath('./a/@href')[0]   # chapter link
            chapter.append({'href': i_1, 'chapter': i_0})
        content.append({'book': title, 'content': chapter})
    # Store the collected data as a JSON file.
    # encoding='utf-8' is required: ensure_ascii=False writes raw Chinese
    # characters, which would raise UnicodeEncodeError under a non-UTF-8
    # default locale (e.g. Windows cp936). Also matches the sibling script above.
    with open('daomubiji0.json', 'w', encoding='utf-8') as fp:
        json.dump(content, fp=fp, ensure_ascii=False, indent=4, sort_keys=True)