您当前的位置: 首页 >  Python

python通过pdfminer或pdfminer3k读取pdf文件

彭世瑜 发布时间:2018-04-12 18:48:22 ,浏览量:2

python3
pip install pdfminer3k
# -*- encoding: utf-8 -*-

# Import urlopen from its Python 3 location, falling back to the Python 2 one.
try:
    from urllib.request import urlopen
except ImportError:
    # Only a missing module should trigger the fallback; any other error
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    from urllib import urlopen

from io import StringIO

from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import  LAParams

def readPdf(pdf_file):
    """Read a PDF and return its text content.

    Args:
        pdf_file: An already opened PDF file object; it is handed to
            ``process_pdf`` as ``fp``. The caller stays responsible for
            closing it.

    Returns:
        The text that the ``TextConverter`` wrote into the in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)
        finally:
            # Close the converter even when parsing fails part-way through.
            device.close()
        # Read the buffer only after the device is closed, as the original did.
        return retstr.getvalue()
    finally:
        # Always release the buffer, on success and on error alike.
        retstr.close()


url = "http://www.pythonscraping.com/pages/warandpeace/chapter1.pdf"
# A local PDF file opened with open(..., "rb") can be used here instead.
pdf_file = urlopen(url)
try:
    content = readPdf(pdf_file)
    print(content)
finally:
    # Close the handle even if extraction or printing raises.
    pdf_file.close()
python2

下载:https://pypi.python.org/pypi/pdfminer/

pip install pdfminer
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def readPdf2(path):
    """Extract the text of the PDF file at ``path`` and return it.

    Args:
        path: Filesystem path of the PDF; it is opened in binary mode.

    Returns:
        The contents of the in-memory buffer that the ``TextConverter``
        (configured with ``codec='utf-8'``) wrote to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # An empty page-number set is passed; presumably this means
                # "no page filter" -- confirm against the pdfminer docs.
                for page in PDFPage.get_pages(fp, set()):
                    interpreter.process_page(page)
            # Grab the text before the device and buffer are closed below.
            return retstr.getvalue()
        finally:
            # Close the converter even when a page fails to process.
            device.close()
    finally:
        # Always release the buffer, on success and on error alike.
        retstr.close()

# NOTE(review): "path" looks like a placeholder -- replace it with a real PDF file path.
text = readPdf2("path")

参考: 用PDFMiner从PDF中提取文本文字

关注
打赏
1688896170
查看更多评论

彭世瑜

暂无认证

  • 2浏览

    0关注

    2727博文

    0收益

  • 0浏览

    0点赞

    0打赏

    0留言

私信
关注
热门博文
立即登录/注册

微信扫码登录

0.2212s