您当前的位置: 首页 >  Python

嗨学编程

暂无认证

  • 1浏览

    0关注

    1405博文

    0收益

  • 0浏览

    0点赞

    0打赏

    0留言

私信
关注
热门博文

python爬虫前提技术

嗨学编程 发布时间:2019-04-19 20:04:38 ,浏览量:1

1、BeautifulSoup 解析html如何使用

'''
想要学习Python?Python学习交流群:973783996满足你的需求,资料都已经上传群文件,可以自行下载!
'''

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Tutorial listing, section 1: parsing HTML with BeautifulSoup.
# NOTE(review): this listing was scraped from a blog post and had been
# collapsed onto single lines; the original Python 2 `print x` statements
# are normalised here to Python 3 `print(x)` calls so they agree with the
# Python-3-only urllib snippets further down the same post.
from bs4 import BeautifulSoup
import re

# String to be parsed.
# NOTE(review): the markup inside this literal was stripped during page
# extraction; restored from the canonical BeautifulSoup "three sisters"
# example, which the calls below (soup.title, soup.p['class'],
# soup.a['href'], id="link3") clearly assume — confirm against the
# original post.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</body></html>
"""

# Create a BeautifulSoup object from the HTML string.
# (The original passed from_encoding='utf-8'; that argument only applies
# to bytes input and is ignored with a warning for str, so it is dropped.)
soup = BeautifulSoup(html_doc, 'html.parser')

print(soup.title)              # first <title> tag
print(soup.title.name)         # its tag name
print(soup.title.string)       # its text content
print(soup.title.parent.name)  # name of its parent tag
print(soup.p)                  # first <p> tag
print(soup.p['class'])         # class attribute of the first <p>
print(soup.a['href'])          # href attribute of the first <a>

# Tag attributes can be added, deleted or modified; they behave like a dict.
soup.a['href'] = 'http://www.baidu.com/'  # modify href of the first <a>
soup.a['name'] = u'百度'                   # add a name attribute
del soup.a['class']                        # delete its class attribute

print(soup.p.contents)        # all child nodes of the first <p>
print(soup.a)                 # first <a> tag
print(soup.find_all('a'))     # every <a> tag, as a list
print(soup.find(id="link3"))  # first tag whose id attribute equals "link3"
print(soup.get_text())        # all text content
print(soup.a.attrs)           # every attribute of the first <a>

for link in soup.find_all('a'):
    print(link.get('href'))   # href of each <a>

for child in soup.p.children:  # iterate the children of the first <p>
    print(child)

for tag in soup.find_all(re.compile("b")):  # tags whose name contains "b"
    print(tag.name)

# Section 2: cookies and function-based crawling.
# See: https://cuiqingcai.com/968.html
#
# Section 3: headers, proxies, timeouts, authentication, error handling.
# See: http://blog.csdn.net/m_buddy/article/details/55193762
#
# Section 4: error handling — 1. URLError / HTTPError.
# NOTE(review): the original listing imported urllib twice and pulled in
# unused names (requests, Request, urlopen, urllib.parse); trimmed to the
# modules actually used.

# -*- coding: UTF-8 -*-
import urllib.request
import urllib.error

# Example 1: catch HTTPError and print only the status code.
if __name__ == "__main__":
    # A URL that does not exist (the server answers with an HTTP error).
    url = "http://www.douyu.com/Jack_Cui.html"
    # NOTE(review): the original bound this to `request`, shadowing the
    # urllib.request module name; renamed to `req`.
    req = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(req)
        # html = response.read()
    except urllib.error.HTTPError as e:
        print(e.code)

# Reported output:
#   C:\Python34\python.exe G:/xiaoshuo2.py
#   Process finished with exit code 0

# Example 2: same request, but read and print the page body on success.
if __name__ == "__main__":
    url = "http://www.douyu.com/Jack_Cui.html"
    req = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
        print(html)
    except urllib.error.HTTPError as e:
        print(e.code)

# Reported output:
#   C:\Python34\python.exe G:/xiaoshuo2.py
#   Process finished with exit code 0

# Example 3: catch URLError and distinguish an HTTP error (has .code)
# from a plain connection error (has .reason).
url = "http://www.douyu.com/Jack_Cui.html"
rep = urllib.request.Request(url)
try:
    data = urllib.request.urlopen(rep)
except urllib.error.URLError as e:
    if hasattr(e, 'code'):
        print("HTTPError")
        print(e.code)
    if hasattr(e, 'reason'):
        print("URLError")
        print(e.reason)

# Reported output:
#   C:\Python34\python.exe G:/xiaoshuo2.py
#   HTTPError
#   URLError
#   Forbidden
#   Process finished with exit code 0

# Section 5: suppressing / forcing newlines when printing.
# See: https://www.cnblogs.com/kfx2007/p/5970784.html
# Example: extract th/td cells from an HTML table with regexes and print
# them space-separated via print(..., end=' ').
#
# NOTE(review): the HTML tags inside the `language` literal and inside
# every regex pattern below were stripped when this post was scraped —
# the patterns read r'(.*?)' instead of e.g. r'<tr>(.*?)</tr>'. They are
# kept verbatim; restore them from the original post before running.

# coding=utf-8
import re

language = ''''' jenkins
性別: 男d
異名: (字) 翔宇
爱好: 篮球
籍貫: 广西省桂林市
'''

# Each <tr> row of the table.
res_tr = r'(.*?)'  # NOTE(review): tag content stripped — TODO restore
m_tr = re.findall(res_tr, language, re.S | re.M)
for line in m_tr:
    # First column: <th> header cells.
    res_th = r'(.*?)'  # NOTE(review): tag content stripped — TODO restore
    m_th = re.findall(res_th, line, re.S | re.M)
    for mm in m_th:
        if "href" in mm:
            # A bold th containing a hyperlink: extract the link text.
            restr = r'(.*?)'  # NOTE(review): tag content stripped
            h = re.findall(restr, mm, re.S | re.M)
            print(h[0], end=' ')  # space instead of newline
        else:
            print(mm, end=' ')  # plain cell; space keeps the row on one line
    # Second column: <td> value cells.
    res_td = r'(.*?)'  # NOTE(review): tag content stripped — TODO restore
    m_td = re.findall(res_td, line, re.S | re.M)
    for nn in m_td:
        if "href" in nn:
            # Handle hyperlinked values.
            res_value = r'(.*?)'  # NOTE(review): tag content stripped
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value, end=' ')
        elif "span" in nn:
            # Handle span-wrapped values, e.g. "(字) 翔宇".
            res_value = r'(.*?)'  # NOTE(review): tag content stripped
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value, end=' ')
        else:
            print(nn, end=' ')
    print(' ')  # end of row: emit a newline

# Reported output:
#   C:\Python34\python.exe G:/xiaoshuo2.py
#   性別: 男 異名: (字) 翔宇 爱好: 篮球 籍貫: 广西省 桂林市

# Section 6: printing without a newline (continued below).
https://www.cnblogs.com/hwd9654/p/5707920.html # -*- coding:utf-8 -*- import urllib import re #import requests import urllib.parse import urllib.request from urllib.request import Request, urlopen from urllib.error import URLError, HTTPError class Tool: removeImg = re.compile('| {7}|') removeAddr = re.compile('|') replaceLine = re.compile('|
|
|

') replaceTD= re.compile('') replacePara = re.compile('') replaceBR = re.compile('|') removeExtraTag = re.compile('') def replace(self,x): x = re.sub(self.removeImg,"",x) x = re.sub(self.removeAddr,"",x) x = re.sub(self.replaceLine,"\n",x) x = re.sub(self.replaceTD,"\t",x) x = re.sub(self.replacePara,"\n",x) x = re.sub(self.replaceBR,"\n",x) x = re.sub(self.removeExtraTag,"",x) return x.strip() class BDTB: def __init__(self,baseUrl,seeLZ): self.baseURL = baseUrl self.seeLZ = '?see_lz='+str(seeLZ) self.tool = Tool() def getPage(self,pageNum): try: url = self.baseURL+ self.seeLZ + '&pn=' + str(pageNum) request = urllib.request.Request(url) response = urllib.request.urlopen(request).read().decode("utf8") #print (response) return response except urllib.error.URLError as e: if hasattr(e,"reason"): print ("连接百度贴吧失败,错误原因",e.reason) return None def getTitle(self): page = self.getPage(1) pattern = re.compile('
关注
打赏
1663681728
查看更多评论
0.0859s