您当前的位置: 首页 >  Python

Python爬虫:xpath常用方法示例

彭世瑜 发布时间:2018-05-18 14:53:26 ,浏览量:5

# -*-coding:utf-8-*-

# Demonstrates common XPath and CSS selector calls on scrapy's ``Selector``:
# extracting text, attributes, regex captures, relative queries, and the
# different ways of collecting the text of an element.

from scrapy.selector import Selector

# Sample document. NOTE(review): the published listing had lost all of its
# markup (only the text nodes survived), which makes every selector below
# return nothing. The tags were reconstructed from the sample page used in
# the Scrapy selector documentation, which the surviving text matches --
# confirm against the author's original if exact attribute values matter.
html = """
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
  </div>
 </body>
</html>
"""

sel = Selector(text=html)

# Text of the <title> element, via XPath and via CSS.
print("================title===============")
title_by_xpath = sel.xpath("//title//text()").extract_first()
print(title_by_xpath)
title_by_css = sel.css("title::text").extract_first()
print(title_by_css)

# The href attribute of every <a>.
print("================href===============")
hrefs = sel.xpath("//a/@href").extract()
print(hrefs)
hrefs_by_css = sel.css("a::attr(href)").extract()
print(hrefs_by_css)

# Only the hrefs that contain the substring "image".
print("================img===============")
imgs = sel.xpath("//a[contains(@href, 'image')]/@href").extract()
print(imgs)
imgs_by_css = sel.css("a[href*=image]::attr(href)").extract()
print(imgs_by_css)

# The src of each <img> nested inside those links.
print("================src===============")
src = sel.xpath("//a[contains(@href, 'image')]/img/@src").extract()
print(src)
src_by_css = sel.css("a[href*=image] img::attr(src)").extract()
print(src_by_css)

# .re() applies the regex to each selected text node and returns the
# captured group(s) as plain strings.
print("================ re ===============")
text_by_re = sel.css("a[href*=image]::text").re(r"Name:\s*(.*)")
print(text_by_re)

# Relative path: a leading "." makes the query start from the selected <div>
# rather than from the document root.
print("================ xpath ===============")
div = sel.xpath("//div")
print(div)
a = div.xpath(".//a").extract()  # every <a> element below the current node
print(a)

print("================ text ===============")
# NOTE(review): markup reconstructed here as well; the comment below mentions
# <strong>, so the fragment must have contained one.
text = '<a href="#">Click here to go to the <strong>Next Page</strong></a>'
sel1 = Selector(text=text)

# Text nodes that are direct children of <a>.
a = sel1.xpath("//a/text()").extract()
print(a)

# All text nodes below <a>, including the one inside <strong>.
a = sel1.xpath("//a//text()").extract()
print(a)

# string() collapses the whole text content into a single string.
a = sel1.xpath("string(//a)").extract()
print(a)

a = sel1.xpath("string(.)").extract()
print(a)


# Shorthand (recommended): run an XPath against ``sel`` and return the
# extracted list of strings in one call.
def xp(x):
    return sel.xpath(x).extract()


all_a = xp("//a/text()")
print(all_a)
关注
打赏
1688896170
查看更多评论

彭世瑜

暂无认证

  • 5浏览

    0关注

    2727博文

    0收益

  • 0浏览

    0点赞

    0打赏

    0留言

私信
关注
热门博文
立即登录/注册

微信扫码登录

0.0783s