您当前的位置: 首页 >  Python

嗨学编程

暂无认证

  • 1浏览

    0关注

    1405博文

    0收益

  • 0浏览

    0点赞

    0打赏

    0留言

私信
关注
热门博文

python爬虫前提技术

嗨学编程 发布时间:2019-04-19 20:04:38 ,浏览量:1

1、BeautifulSoup 解析html如何使用

'''
想要学习Python?Python学习交流群:973783996满足你的需求,资料都已经上传群文件,可以自行下载!
'''

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Tutorial listing, section 1: parsing HTML with BeautifulSoup.
# NOTE(review): this listing was scraped from a blog post and had been
# collapsed onto single lines; the original Python 2 `print x` statements
# are normalised here to Python 3 `print(x)` calls so they agree with the
# Python-3-only urllib snippets further down the same post.
from bs4 import BeautifulSoup
import re

# String to be parsed.
# NOTE(review): the markup inside this literal was stripped during page
# extraction; restored from the canonical BeautifulSoup "three sisters"
# example, which the calls below (soup.title, soup.p['class'],
# soup.a['href'], id="link3") clearly assume — confirm against the
# original post.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</body></html>
"""

# Create a BeautifulSoup object from the HTML string.
# (The original passed from_encoding='utf-8'; that argument only applies
# to bytes input and is ignored with a warning for str, so it is dropped.)
soup = BeautifulSoup(html_doc, 'html.parser')

print(soup.title)              # first <title> tag
print(soup.title.name)         # its tag name
print(soup.title.string)       # its text content
print(soup.title.parent.name)  # name of its parent tag
print(soup.p)                  # first <p> tag
print(soup.p['class'])         # class attribute of the first <p>
print(soup.a['href'])          # href attribute of the first <a>

# Tag attributes can be added, deleted or modified; they behave like a dict.
soup.a['href'] = 'http://www.baidu.com/'  # modify href of the first <a>
soup.a['name'] = u'百度'                   # add a name attribute
del soup.a['class']                        # delete its class attribute

print(soup.p.contents)        # all child nodes of the first <p>
print(soup.a)                 # first <a> tag
print(soup.find_all('a'))     # every <a> tag, as a list
print(soup.find(id="link3"))  # first tag whose id attribute equals "link3"
print(soup.get_text())        # all text content
print(soup.a.attrs)           # every attribute of the first <a>

for link in soup.find_all('a'):
    print(link.get('href'))   # href of each <a>

for child in soup.p.children:  # iterate the children of the first <p>
    print(child)

for tag in soup.find_all(re.compile("b")):  # tags whose name contains "b"
    print(tag.name)

# Section 2: cookies and function-based crawling.
# See: https://cuiqingcai.com/968.html
#
# Section 3: headers, proxies, timeouts, authentication, error handling.
# See: http://blog.csdn.net/m_buddy/article/details/55193762
#
# Section 4: error handling — 1. URLError / HTTPError.
# NOTE(review): the original listing imported urllib twice and pulled in
# unused names (requests, Request, urlopen, urllib.parse); trimmed to the
# modules actually used.

# -*- coding: UTF-8 -*-
import urllib.request
import urllib.error

# Example 1: catch HTTPError and print only the status code.
if __name__ == "__main__":
    # A URL that does not exist (the server answers with an HTTP error).
    url = "http://www.douyu.com/Jack_Cui.html"
    # NOTE(review): the original bound this to `request`, shadowing the
    # urllib.request module name; renamed to `req`.
    req = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(req)
        # html = response.read()
    except urllib.error.HTTPError as e:
        print(e.code)

# Reported output:
#   C:\Python34\python.exe G:/xiaoshuo2.py
#   Process finished with exit code 0

# Example 2: same request, but read and print the page body on success.
if __name__ == "__main__":
    url = "http://www.douyu.com/Jack_Cui.html"
    req = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
        print(html)
    except urllib.error.HTTPError as e:
        print(e.code)

# Reported output:
#   C:\Python34\python.exe G:/xiaoshuo2.py
#   Process finished with exit code 0

# Example 3: catch URLError and distinguish an HTTP error (has .code)
# from a plain connection error (has .reason).
url = "http://www.douyu.com/Jack_Cui.html"
rep = urllib.request.Request(url)
try:
    data = urllib.request.urlopen(rep)
except urllib.error.URLError as e:
    if hasattr(e, 'code'):
        print("HTTPError")
        print(e.code)
    if hasattr(e, 'reason'):
        print("URLError")
        print(e.reason)

# Reported output:
#   C:\Python34\python.exe G:/xiaoshuo2.py
#   HTTPError
#   URLError
#   Forbidden
#   Process finished with exit code 0

# Section 5: suppressing / forcing newlines when printing.
# See: https://www.cnblogs.com/kfx2007/p/5970784.html
# Example: extract th/td cells from an HTML table with regexes and print
# them space-separated via print(..., end=' ').
#
# NOTE(review): the HTML tags inside the `language` literal and inside
# every regex pattern below were stripped when this post was scraped —
# the patterns read r'(.*?)' instead of e.g. r'<tr>(.*?)</tr>'. They are
# kept verbatim; restore them from the original post before running.

# coding=utf-8
import re

language = ''''' jenkins
性別: 男d
異名: (字) 翔宇
爱好: 篮球
籍貫: 广西省桂林市
'''

# Each <tr> row of the table.
res_tr = r'(.*?)'  # NOTE(review): tag content stripped — TODO restore
m_tr = re.findall(res_tr, language, re.S | re.M)
for line in m_tr:
    # First column: <th> header cells.
    res_th = r'(.*?)'  # NOTE(review): tag content stripped — TODO restore
    m_th = re.findall(res_th, line, re.S | re.M)
    for mm in m_th:
        if "href" in mm:
            # A bold th containing a hyperlink: extract the link text.
            restr = r'(.*?)'  # NOTE(review): tag content stripped
            h = re.findall(restr, mm, re.S | re.M)
            print(h[0], end=' ')  # space instead of newline
        else:
            print(mm, end=' ')  # plain cell; space keeps the row on one line
    # Second column: <td> value cells.
    res_td = r'(.*?)'  # NOTE(review): tag content stripped — TODO restore
    m_td = re.findall(res_td, line, re.S | re.M)
    for nn in m_td:
        if "href" in nn:
            # Handle hyperlinked values.
            res_value = r'(.*?)'  # NOTE(review): tag content stripped
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value, end=' ')
        elif "span" in nn:
            # Handle span-wrapped values, e.g. "(字) 翔宇".
            res_value = r'(.*?)'  # NOTE(review): tag content stripped
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value, end=' ')
        else:
            print(nn, end=' ')
    print(' ')  # end of row: emit a newline

# Reported output:
#   C:\Python34\python.exe G:/xiaoshuo2.py
#   性別: 男 異名: (字) 翔宇 爱好: 篮球 籍貫: 广西省 桂林市

# Section 6: printing without a newline (continued below).
https://www.cnblogs.com/hwd9654/p/5707920.html # -*- coding:utf-8 -*- import urllib import re #import requests import urllib.parse import urllib.request from urllib.request import Request, urlopen from urllib.error import URLError, HTTPError class Tool: removeImg = re.compile('| {7}|') removeAddr = re.compile('|') replaceLine = re.compile('|
|
|

') replaceTD= re.compile('') replacePara = re.compile('') replaceBR = re.compile('|') removeExtraTag = re.compile('') def replace(self,x): x = re.sub(self.removeImg,"",x) x = re.sub(self.removeAddr,"",x) x = re.sub(self.replaceLine,"\n",x) x = re.sub(self.replaceTD,"\t",x) x = re.sub(self.replacePara,"\n",x) x = re.sub(self.replaceBR,"\n",x) x = re.sub(self.removeExtraTag,"",x) return x.strip() class BDTB: def __init__(self,baseUrl,seeLZ): self.baseURL = baseUrl self.seeLZ = '?see_lz='+str(seeLZ) self.tool = Tool() def getPage(self,pageNum): try: url = self.baseURL+ self.seeLZ + '&pn=' + str(pageNum) request = urllib.request.Request(url) response = urllib.request.urlopen(request).read().decode("utf8") #print (response) return response except urllib.error.URLError as e: if hasattr(e,"reason"): print ("连接百度贴吧失败,错误原因",e.reason) return None def getTitle(self): page = self.getPage(1) pattern = re.compile('
关注
打赏
1663681728
查看更多评论
0.0859s