A recommended XPath testing tool for Chrome: XPath Helper
It makes it easy to grab the XPath of any HTML element. Toggle its console with Ctrl+Shift+X.
Reference: "An introduction to XPath Helper, a Chrome page-parsing tool for crawlers"
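For example, with the XPath Helper console open you can type an expression like the one below and watch the matched nodes highlight live (the div class here is an assumption about the markup of the teacher page scraped later in this post):

//div[@class="li_txt"]/h3/text()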
Scrapy basics

# Create a project: scrapy startproject myspider  (generated layout sketched after this list)
# Create a spider:  scrapy genspider itcast itcast.cn
# List spiders:     scrapy list
# Run a spider:     scrapy crawl itcast
# 4 export formats: json, jsonl, csv, xml (JSON output escapes non-ASCII to \uXXXX by default)
# Export as JSON:   scrapy crawl itcast -o data.json
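For orientation, startproject plus genspider produce roughly the layout below (the separate itcast_item.py / itcast_pipeline.py modules used in the complete code later are this post's own split, not part of the generated skeleton):

myspider/
    scrapy.cfg              # deploy configuration
    myspider/
        __init__.py
        items.py            # item definitions
        pipelines.py        # item pipelines
        settings.py         # project settings
        spiders/
            __init__.py
            itcast.py       # spider skeleton generated by genspider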
# Open an interactive shell: scrapy shell <url>
# response.headers -- the response headers
# response.body    -- the raw response body
# Data extracted through selectors always comes back as a list
# response.xpath() -- returns a list of selectors
# response.css()   -- same, using CSS selectors
# extract()        -- converts the selector objects to unicode strings
# re()             -- applies a regular expression to the matches
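Putting these together, a minimal shell session looks like this (output omitted; the CSS/XPath expressions assume the teacher page markup used in the spider below):

scrapy shell "http://www.itcast.cn/channel/teacher.shtml"
>>> response.headers                                # response headers
>>> response.body[:200]                             # first bytes of the raw body
>>> response.xpath("//h3/text()")                   # a list of selector objects
>>> response.xpath("//h3/text()").extract()         # a list of unicode strings
>>> response.css(".li_txt h3::text").extract()      # same data via a CSS selector
>>> response.xpath("//h3/text()").re(r"\S+")        # regex applied to each match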
Complete code
To illustrate the basic usage of the spider, item, and pipeline classes, the code below is somewhat verbose.
# itcast_spider.py
# -*- coding: utf-8 -*-
import scrapy

from myspider.items.itcast_item import ItcastItem


class ItcastSpider(scrapy.Spider):
    # Spider name: required and must be unique within the project
    name = "itcast"
    # Restrict crawling to these domains (optional)
    allowed_domains = ["itcast.cn"]
    # Per-spider configuration of the pipelines that handle its items
    custom_settings = {
        "ITEM_PIPELINES": {
            "myspider.pipelines.itcast_pipeline.ItcastPipeline": 1,
        }
    }
    # The first batch of URLs to crawl
    start_urls = [
        "http://www.itcast.cn/channel/teacher.shtml"
    ]

    def parse(self, response):
        # Parse each teacher entry in the listing
        for li_txt in response.css(".li_txt"):
            name = li_txt.xpath("./h3/text()").extract()[0]
            title = li_txt.xpath("./h4/text()").extract()[0]
            info = li_txt.xpath("./p/text()").extract()[0]
            # Put the data into an item and yield it to the pipeline
            item = ItcastItem()
            item["name"] = name
            item["title"] = title
            item["info"] = info
            yield item
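Note that extract()[0] raises an IndexError when nothing matches; Scrapy also provides extract_first(), which returns None instead, so a safer variant of the extraction lines above is:

name = li_txt.xpath("./h3/text()").extract_first()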
# itcast_item.py
# -*- coding: utf-8 -*-
import scrapy


class ItcastItem(scrapy.Item):
    name = scrapy.Field()   # name
    title = scrapy.Field()  # job title
    info = scrapy.Field()   # details
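An Item behaves like a dict restricted to its declared Fields, which catches field-name typos early. A quick sketch with made-up values:

item = ItcastItem(name="Alice")
item["title"] = "lecturer"   # fine: declared field
print(dict(item))            # {'name': 'Alice', 'title': 'lecturer'}
item["phone"] = "123"        # raises KeyError: undeclared field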
# itcast_pipeline.py
# -*- coding: utf-8 -*-
import json


class ItcastPipeline(object):
    # The pipeline class is instantiated only once
    def __init__(self):
        print("@@@@@@ pipeline initialized")
        self.f = open("itcast.json", "w", encoding="utf-8")
        self.count = 0  # item counter

    def process_item(self, item, spider):
        # The one method every pipeline must implement
        dct = json.dumps(dict(item), ensure_ascii=False)
        self.f.write(dct + "\n")
        self.count += 1
        return item  # must return the item so later pipelines can process it

    def open_spider(self, spider):
        print("@@@@@@ spider opened")

    def close_spider(self, spider):
        self.f.close()
        print("@@@@@@ spider closed")
        print("items scraped: %s" % self.count)