批量PDF电子发票识别与Excel导出系统

发布于:2025-06-28 ⋅ 阅读:(10) ⋅ 点赞:(0)

批量PDF电子发票识别与Excel导出系统

下面是一个完整的Python解决方案,用于批量识别PDF电子发票并将关键信息提取到Excel。系统采用模块化设计,结合OCR和文本解析技术,可处理扫描版和数字版PDF发票。

import os
import re
import pytesseract
import pdf2image
import pandas as pd
from PIL import Image
import numpy as np
from pdfminer.high_level import extract_text
from datetime import datetime
import time
import logging
from concurrent.futures import ThreadPoolExecutor
import fitz  # PyMuPDF

# Configure logging: every record is sent to both a log file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: the log messages in this file are Chinese, and the
        # default file encoding otherwise depends on the platform locale.
        logging.FileHandler("invoice_processing.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
# Module-level logger used by the processing code below.
logger = logging.getLogger("InvoiceProcessor")

# Location of the Tesseract executable (adjust for your system).
# The Windows default install path is applied only when that file actually
# exists. Otherwise pytesseract's own setting is left untouched, so the
# `tesseract` binary is looked up on PATH (the usual Linux/macOS setup,
# e.g. /usr/bin/tesseract).
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD

# Regular expressions for the key invoice fields; capture group 1 holds the
# extracted value. Label separators accept both the full-width colon (:) and
# the ASCII colon (:), and amounts accept both the full-width (¥) and the
# half-width (¥) yen sign, since either form can appear in extracted text.
PATTERNS = {
    'invoice_code': r'发票代码[::\s]*([0-9]{10,12})',
    'invoice_number': r'发票号码[::\s]*([0-9]{8,10})',
    'invoice_date': r'开票日期[::\s]*(\d{4}年\d{1,2}月\d{1,2}日|\d{4}-\d{1,2}-\d{1,2})',
    'amount': r'金额合计[::\s]*(?:¥|¥|人民币)?\s*([\d,]+\.\d{2})',
    'tax_amount': r'税额合计[::\s]*(?:¥|¥|人民币)?\s*([\d,]+\.\d{2})',
    'total_amount': r'价税合计[::\s]*(?:¥|¥|人民币)?\s*([\d,]+\.\d{2})',
    'seller_name': r'销售方[::]\s*([^\s]{6,40})',
    'seller_tax_id': r'销售方纳税人识别号[::\s]*([0-9A-Za-z]{15,20})',
    'buyer_name': r'购买方[::]\s*([^\s]{6,40})',
    'buyer_tax_id': r'购买方纳税人识别号[::\s]*([0-9A-Za-z]{15,20})',
}

class InvoiceProcessor:
    def __init__(self, pdf_folder, output_excel):
        self.pdf_folder = pdf_folder
        self.output_excel = output_excel
        self.invoice_data = []
        self.processed_files = 0
        self.failed_files = 0
        self.start_time = time.time()
        
    def _convert_pdf_to_images(self, pdf_path, dpi=200, poppler_path=None):
        """Render a PDF into a list of images.

        Args:
            pdf_path: Path of the PDF file to render.
            dpi: Rendering resolution passed to pdf2image.
            poppler_path: Optional directory containing the Poppler binaries.
                When omitted, the Windows example install directory is used
                if it exists; otherwise ``None`` is passed so that pdf2image
                locates Poppler via PATH.

        Returns:
            The images returned by ``pdf2image.convert_from_path``, or an
            empty list when the conversion fails (the error is logged).
        """
        if poppler_path is None:
            # Windows example location; on other systems (e.g. macOS with
            # /opt/homebrew/bin on PATH) this directory does not exist and
            # the PATH lookup is used instead.
            windows_poppler = r'C:\Program Files\poppler-23.11.0\Library\bin'
            if os.path.isdir(windows_poppler):
                poppler_path = windows_poppler
        try:
            return pdf2image.convert_from_path(
                pdf_path,
                dpi=dpi,
                poppler_path=poppler_path,
            )
        except Exception as e:
            # Deliberate best-effort: one unreadable PDF is logged and
            # reported as "no pages" rather than raised to the caller.
            logger.error(f"PDF转换失败: {pdf_path} - {e}")
            return []
    
    def _preprocess_image(self, image):
        """Clean up a page image to improve OCR accuracy.

        The image is converted to grayscale, binarized with Otsu's method and
        then denoised.

        Args:
            image: Image accepted by ``np.array`` (the OCR path passes it the
                page image it was given).

        Returns:
            A ``PIL.Image`` built from the denoised binary array.
        """
        # OpenCV is used below but is not imported at the top of this file,
        # so it is imported here to avoid a NameError.
        import cv2

        img = np.array(image)
        # Arrays built from PIL images are in RGB channel order, so the RGB
        # conversion code is required; the BGR code would swap the red and
        # blue weights. Arrays that are already 2-D are used as-is.
        if img.ndim > 2:
            gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        else:
            gray = img

        # With THRESH_OTSU the threshold is computed from the image and the
        # numeric threshold argument is ignored, hence 0.
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Non-local-means denoising (h=10, template window 7, search window 21).
        denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

        return Image.fromarray(denoised)
    
    def _ocr_image(self, image):
        """Run Tesseract OCR on a page image.

        Args:
            image: Page image; it is passed through ``_preprocess_image``
                before recognition.

        Returns:
            The text returned by ``pytesseract.image_to_string``, or an empty
            string when preprocessing or OCR fails (the error is logged).
        """
        try:
            processed_img = self._preprocess_image(image)

            # Flags: --oem 3, --psm 6, languages chi_sim+eng.
            custom_config = r'--oem 3 --psm 6 -l chi_sim+eng'
            return pytesseract.image_to_string(
                processed_img,
                config=custom_config,
            )
        except Exception as e:
            # Deliberate best-effort: a failed page yields empty text.
            logger.error(f"OCR处理失败: {e}")
            return ""
    
    def _extract_from_digital_pdf(self, pdf_path):
        """Extract the text of a PDF directly, without OCR.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            The text returned by pdfminer's ``extract_text``, or an empty
            string when extraction fails (the error is logged).
        """
        try:
            return extract_text(pdf_path)
        except Exception as e:
            # Deliberate best-effort: failure is reported as empty text.
            logger.error(f"数字PDF提取失败: {pdf_path} - {e}")
            return ""
    
    def _parse_invoice_data(self, text):
        """从文本中解析发票信息"""
        result = {
   }
        for key, pattern in PATTERNS.items():
            match = re.search(pattern, text)
            if match:
                result[key] = match.group(1).strip()
            else:
                result

网站公告

今日签到

点亮在社区的每一天
去签到