读取 PDF:支持网络文件流或本地文件

发布于:2025-08-15 ⋅ 阅读:(11) ⋅ 点赞:(0)

import requests
import pdfplumber
import io
import os
def read_pdf(path, proxies=None, timeout=(3.2, 10), download_image=False, headers=None) -> str:
    """Extract the text of a PDF given an http(s) URL or a local path.

    Args:
        path: An ``http://`` / ``https://`` URL, a ``file:///`` URL, or a
            plain filesystem path (percent-escapes are decoded).
        proxies: Optional proxy mapping forwarded to ``requests.get``.
        timeout: Timeout forwarded to ``requests.get``.
        download_image: When true, the raw data of every embedded image is
            written to ``<n>.png`` files. For a URL the target directory is
            the URL without its scheme (relative to the current working
            directory); for a local file it is ``<path>_images``.
        headers: Optional HTTP headers forwarded to ``requests.get``.

    Returns:
        The concatenated text of all pages, or ``''`` when the path is
        invalid or the source cannot be fetched/opened (a message is printed
        in those cases). Errors raised while parsing the PDF propagate.
    """
    # Imported here because the file's import block does not provide it.
    import urllib.parse

    if not isinstance(path, str) or not path:
        print("路径为空或格式不对!")
        return ''

    if path.startswith(('http://', 'https://')):
        try:
            response = requests.get(url=path, timeout=timeout,
                                    proxies=proxies, headers=headers)
            # Do not hand an HTML error page to the PDF parser.
            response.raise_for_status()
        except requests.RequestException as e:
            print(e, "打开链接失败")
            return ''
        source = io.BytesIO(response.content)
        image_dir = path.replace('https://', '').replace('http://', '')
    else:
        local_path = urllib.parse.unquote(path)
        if local_path.startswith('file:///'):
            # On Windows a drive letter follows the third slash; elsewhere
            # that slash is the root of the absolute path and must be kept.
            local_path = local_path[8:] if os.name == 'nt' else local_path[7:]
        # normpath yields the platform's separators (backslashes on Windows).
        local_path = os.path.normpath(local_path)
        try:
            source = open(local_path, 'rb')
        except (OSError, ValueError) as e:
            print(e, "打开本地文件失败")
            return ''
        # The PDF path itself is an existing file and cannot double as a
        # directory, so images go next to it.
        image_dir = local_path + '_images'

    page_texts = []
    image_index = 0  # runs across pages so later pages never overwrite files
    # Both context managers guarantee the stream is released on any error.
    with source, pdfplumber.open(source) as pdf:
        if download_image:
            os.makedirs(image_dir, exist_ok=True)
        for page in pdf.pages:
            # extract_text() may yield None for a page without text.
            page_texts.append(page.extract_text() or '')
            if not download_image:
                continue
            for img in page.images:
                target = os.path.join(image_dir, '{}.png'.format(image_index))
                # NOTE(review): the stream payload is written as-is, without
                # re-encoding, so the .png extension may not match the actual
                # image data -- confirm against real inputs.
                with open(target, 'wb') as out:
                    out.write(img['stream'].get_data())
                image_index += 1
    return ''.join(page_texts)


# Demo run: fetch a sample PDF through a local proxy with browser-like headers.
url = (
    'https://www.airuniversity.af.edu/Portals/10/ASPJ/journals/'
    'Volume-27_Issue-6/V-Soine-Harker-Heminger-Scherrer.pdf'
)
# Same proxy endpoint for both schemes.
proxies = dict.fromkeys(('http', 'https'), '192.168.1.122:1080')
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0'
    ),
    'sec-fetch-dest': 'document',
}
read_pdf(url, proxies=proxies, headers=headers)
 


网站公告

今日签到

点亮在社区的每一天
去签到