python通过pdfminer或pdfminer3k读取pdf文件

python3

pip install pdfminer3k
# -*- encoding: utf-8 -*-

try:
    from urllib.request import urlopen
except:
    from urllib import urlopen

from io import StringIO

from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import  LAParams

# 读取pdf的函数,返回内容
def readPdf(pdf_file):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr, laparams=laparams)

    process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)
    device.close()

    content = retstr.getvalue()
    retstr.close()

    return content


url = "http://www.pythonscraping.com/pages/warandpeace/chapter1.pdf"
pdf_file = urlopen(url) # 也可以换成本地pdf文件,用open rb模式打开
content = readPdf(pdf_file)
print(content)
pdf_file.close()

python2

下载:https://pypi.python.org/pypi/pdfminer/

pip install pdfminer
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def readPdf2(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set()):
            interpreter.process_page(page)
        text = retstr.getvalue()
    device.close()
    retstr.close()
    return text

text = readPdf2("path")

参考:
用PDFMiner从PDF中提取文本文字

展开阅读全文
©️2020 CSDN 皮肤主题: 编程工作室 设计师: CSDN官方博客 返回首页
实付0元
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值