PDF OCR 优化建议

Question

import sys
import fitz # PyMuPDF
from paddleocr import PaddleOCR
from PIL import Image
import io
import traceback
import numpy as np
import re
import cv2
from concurrent.futures import ThreadPoolExecutor
import os
import logging

配置日志

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

清理文本中的噪声字符

def clean_text(text):
    """Reduce extracted text to space-separated letters, digits and CJK ideographs.

    Section markers of the form ``--- Page N ---`` and
    ``--- OCR Text from Image N ---`` are removed, every run of characters
    outside ``A-Z``, ``a-z``, ``0-9`` and U+4E00..U+9FA5 becomes one space,
    and leading/trailing whitespace is dropped.

    Args:
        text: Raw text taken from the PDF text layer or from OCR.

    Returns:
        The cleaned single-line string (empty when nothing survives).
    """
    cleaned_text = re.sub(r'--- (Page \d+|OCR Text from Image \d+) ---', '', text)
    cleaned_text = re.sub(r'[^A-Za-z0-9\u4e00-\u9fa5]+', ' ', cleaned_text)
    return ' '.join(cleaned_text.split())

提取PDF页面文本

def extract_text_from_page(page_num, doc, output_txt):
    """Append the text of one PDF page, plus OCR text from its images, to a file.

    The page's text layer is cleaned with ``clean_text``; every embedded image
    is converted to grayscale, binarized at a fixed threshold of 150 and run
    through PaddleOCR. The combined text is appended to ``output_txt``.

    Args:
        page_num: Zero-based page index.
        doc: Open PyMuPDF document the page is loaded from.
        output_txt: Path of the UTF-8 text file to append to.

    Returns:
        None. Failures are logged instead of raised: a bad image is skipped,
        a failed page writes nothing.
    """
    try:
        # A fresh OCR engine is built on every call, i.e. once per page.
        ocr = PaddleOCR(use_angle_cls=True, lang='ch')

        page = doc.load_page(page_num)
        text_content = clean_text(page.get_text("text"))

        for img_index, img in enumerate(page.get_images(full=True)):
            try:
                base_image = doc.extract_image(img[0])  # img[0] is the image xref
                pil_image = Image.open(io.BytesIO(base_image["image"]))

                # Let Pillow do the grayscale conversion: it copes with RGB, RGBA,
                # palette and already-gray images, whereas cv2.cvtColor(..., BGR2GRAY)
                # raises on 2-D input and mis-weights Pillow's RGB channel order.
                gray_image = np.array(pil_image.convert("L"))
                _, binarized_image = cv2.threshold(gray_image, 150, 255, cv2.THRESH_BINARY)

                result = ocr.ocr(binarized_image)
                # The result holds one entry per input image, and that entry is
                # None/empty when no text was detected (PaddleOCR 2.6+ layout --
                # confirm against the installed version).
                detections = result[0] if result else None
                if not detections:
                    continue

                ocr_text = clean_text(
                    "\n".join(line[1][0] for line in detections if line[1][0].strip())
                )
                if ocr_text:
                    text_content += f"\n--- OCR Text from Image {img_index + 1} ---\n{ocr_text}"

            except Exception as img_error:
                logging.warning(f"Error processing image {img_index + 1} on page {page_num + 1}: {img_error}")
                logging.debug(traceback.format_exc())

        # NOTE(review): when called from several threads the append order is
        # whatever order the pages finish in, not page order.
        with open(output_txt, "a", encoding="utf-8") as txt_file:
            txt_file.write(text_content)

    except Exception as e:
        logging.error(f"Error processing page {page_num + 1}: {e}")
        logging.debug(traceback.format_exc())

提取整个PDF的文本，支持多线程

def extract_text_from_pdf(pdf_file, output_txt):
    """Run ``extract_text_from_page`` over every page of a PDF using a thread pool.

    Args:
        pdf_file: Path of the PDF to read.
        output_txt: Path of the UTF-8 text file; truncated first, then appended
            to by the page workers.

    Returns:
        None. Any failure is logged rather than raised.
    """
    try:
        # The context manager closes the document even if processing fails.
        with fitz.open(pdf_file) as doc:
            total_pages = doc.page_count
            # os.cpu_count() may return None; always ask for at least one worker.
            max_threads = max(1, int((os.cpu_count() or 1) * 1.5))

            # Start from an empty output file.
            with open(output_txt, "w", encoding="utf-8"):
                pass

            # Leaving the executor block waits for every submitted page.
            with ThreadPoolExecutor(max_threads) as executor:
                for page_num in range(total_pages):
                    logging.info(f"Submitting task for page {page_num + 1}/{total_pages}")
                    executor.submit(extract_text_from_page, page_num, doc, output_txt)

        logging.info("Processing complete!")

    except Exception as e:
        logging.error(f"Error processing PDF file {pdf_file}: {e}")
        logging.debug(traceback.format_exc())

命令行让用户输入文件路径

def get_file_paths_from_user():
    """Ask on stdin for the input PDF path and the output text path.

    The PDF path must exist and end in ``.pdf`` (case-insensitive). A ``.txt``
    suffix is appended to the output path when missing, and the output
    directory is created if needed.

    Returns:
        ``(pdf_file, output_txt)``, or ``(None, None)`` when the PDF path is
        rejected.
    """
    pdf_file = input("请输入PDF文件路径：").strip()
    if not os.path.exists(pdf_file) or not pdf_file.lower().endswith('.pdf'):
        logging.error("无效的PDF文件路径。请重新输入有效路径。")
        return None, None

    output_txt = input("请输入输出文本文件路径：").strip()
    if not output_txt.endswith('.txt'):
        output_txt += '.txt'

    # An empty dirname means "current directory": nothing to create.
    output_dir = os.path.dirname(output_txt)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    return pdf_file, output_txt

启动程序

def main():
    """Prompt for the file paths and, if they are valid, run the extraction."""
    pdf_file, output_txt = get_file_paths_from_user()
    if pdf_file and output_txt:
        extract_text_from_pdf(pdf_file, output_txt)


# The dunder underscores are required; ``name == "main"`` raises NameError.
if __name__ == "__main__":
    main()
改进这个代码实现处理pdf的功能

Answer 1

您的代码已经很好地实现了从PDF中提取文本和图像进行OCR的功能。为了进一步改进和优化代码，我将提出以下几点建议，并提供一个优化后的完整代码示例：

改进建议

优化PaddleOCR实例化:
- 每个线程中实例化PaddleOCR可能会导致资源浪费和性能下降。可以考虑在主线程中创建一个共享的PaddleOCR实例，并使用线程安全的队列来分配OCR任务。
线程安全的文件写入:
- 当前多线程直接写入同一个文件可能会导致数据竞争和写入冲突。可以使用线程锁（threading.Lock）来确保写入操作的原子性，或者收集所有结果后在主线程中统一写入。
保持页面顺序:
- 多线程处理页面时，页面处理顺序可能会打乱。可以通过收集所有页面的结果后按页码排序再写入，或者使用concurrent.futures的as_completed结合页码进行排序。
改进图像预处理:
- 根据具体PDF的特点，可以调整图像预处理步骤以提高OCR的准确性。例如，可以尝试不同的二值化方法或图像增强技术。
增强日志记录:
- 增加更多的日志信息，特别是处理进度和性能指标，以便更好地监控程序的运行情况。
错误处理优化:
- 提供更多的错误上下文信息，确保在出错时能够更容易地调试和定位问题。
支持更多语言:
- 如果需要处理多语言的PDF，可以让用户选择OCR的语言。
资源管理:
- 确保所有资源（如文件句柄）都能被正确关闭，避免资源泄漏。

优化后的代码示例

以下是根据上述建议优化后的代码：

python
import sys
import fitz  # PyMuPDF
from paddleocr import PaddleOCR
from PIL import Image
import io
import traceback
import numpy as np
import re
import cv2
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import logging
import threading

# 配置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 清理文本中的噪声字符
def clean_text(text):
    """Reduce extracted text to space-separated letters, digits and CJK ideographs.

    Drops ``--- Page N ---`` / ``--- OCR Text from Image N ---`` markers, then
    turns every run of characters outside ``A-Za-z0-9`` and U+4E00..U+9FA5
    into a single space and trims the ends.
    """
    without_markers = re.sub(r'--- (Page \d+|OCR Text from Image \d+) ---', '', text)
    tokens = re.sub(r'[^A-Za-z0-9\u4e00-\u9fa5]+', ' ', without_markers).split()
    return ' '.join(tokens)

# 提取PDF页面文本
def extract_text_from_page(page_num, doc, ocr, lock, results):
    """Collect the text of one PDF page, including OCR text from its embedded images.

    The page's text layer is cleaned with ``clean_text``; every embedded image
    is converted to grayscale, binarized at a fixed threshold of 150 and passed
    to ``ocr``. The combined text is stored in ``results[page_num]``.

    Args:
        page_num: Zero-based page index.
        doc: Open PyMuPDF document the page is loaded from.
        ocr: PaddleOCR instance used for recognition.
        lock: Lock held while writing to ``results``.
        results: Dict that receives ``page_num -> extracted text``.

    Returns:
        None. Failures are logged instead of raised: a bad image is skipped,
        a failed page leaves no entry in ``results``.
    """
    try:
        page = doc.load_page(page_num)
        text_content = clean_text(page.get_text("text"))

        for img_index, img in enumerate(page.get_images(full=True)):
            try:
                base_image = doc.extract_image(img[0])  # img[0] is the image xref
                pil_image = Image.open(io.BytesIO(base_image["image"]))

                # Let Pillow do the grayscale conversion: it copes with RGB, RGBA,
                # palette and already-gray images, whereas cv2.cvtColor(..., BGR2GRAY)
                # raises on 2-D input and mis-weights Pillow's RGB channel order.
                gray_image = np.array(pil_image.convert("L"))
                _, binarized_image = cv2.threshold(gray_image, 150, 255, cv2.THRESH_BINARY)

                result = ocr.ocr(binarized_image, rec=True, cls=True)
                # The result holds one entry per input image, and that entry is
                # None/empty when no text was detected (PaddleOCR 2.6+ layout --
                # confirm against the installed version). Iterating ``result``
                # itself would walk over images, not text lines.
                detections = result[0] if result else None
                if not detections:
                    continue

                ocr_text = clean_text(
                    "\n".join(line[1][0] for line in detections if line[1][0].strip())
                )
                if ocr_text:
                    text_content += f"\n--- OCR Text from Image {img_index + 1} ---\n{ocr_text}"

            except Exception as img_error:
                logging.warning(f"Error processing image {img_index + 1} on page {page_num + 1}: {img_error}")
                logging.debug(traceback.format_exc())

        # Publish the page text under the shared lock.
        with lock:
            results[page_num] = text_content

    except Exception as e:
        logging.error(f"Error processing page {page_num + 1}: {e}")
        logging.debug(traceback.format_exc())

# 提取整个PDF的文本，支持多线程
def extract_text_from_pdf(pdf_file, output_txt, ocr_lang='ch'):
    """Extract text (text layer + image OCR) from every page of a PDF into a text file.

    Pages are handed to a thread pool and written in page order once all
    workers have finished.

    Args:
        pdf_file: Path of the PDF to read.
        output_txt: Path of the UTF-8 text file to (over)write.
        ocr_lang: PaddleOCR language code, ``'ch'`` by default.

    Returns:
        None. Any failure is logged rather than raised.
    """
    try:
        # The context manager closes the document even if processing fails.
        with fitz.open(pdf_file) as doc:
            total_pages = doc.page_count
            # os.cpu_count() may return None, and ThreadPoolExecutor rejects 0
            # workers (which a PDF without pages would otherwise request).
            max_threads = max(1, min(int((os.cpu_count() or 1) * 1.5), total_pages))

            # Truncate up front so a failed run never leaves stale output behind.
            with open(output_txt, "w", encoding="utf-8"):
                pass

            ocr = PaddleOCR(use_angle_cls=True, lang=ocr_lang)

            # NOTE(review): one document and one OCR engine are shared by all
            # worker threads; PyMuPDF's docs advise against multithreaded use --
            # confirm this is safe or process pages sequentially.
            results = {}
            lock = threading.Lock()

            with ThreadPoolExecutor(max_threads) as executor:
                futures = []
                for page_num in range(total_pages):
                    logging.info(f"Submitting task for page {page_num + 1}/{total_pages}")
                    futures.append(executor.submit(extract_text_from_page, page_num, doc, ocr, lock, results))

                # Surface anything a worker let escape.
                for future in as_completed(futures):
                    try:
                        future.result()
                    except Exception as e:
                        logging.error(f"Exception in thread: {e}")
                        logging.debug(traceback.format_exc())

        # Pages that failed have no entry and contribute an empty section.
        with open(output_txt, "a", encoding="utf-8") as txt_file:
            for page_num in range(total_pages):
                txt_file.write(results.get(page_num, ''))
                txt_file.write("\n\n")

        logging.info("Processing complete!")

    except Exception as e:
        logging.error(f"Error processing PDF file {pdf_file}: {e}")
        logging.debug(traceback.format_exc())

# 命令行让用户输入文件路径
def get_file_paths_from_user():
    """Ask on stdin for the input PDF path and the output text path.

    Returns:
        ``(pdf_file, output_txt)`` with ``.txt`` appended to the output path
        when missing and its directory created, or ``(None, None)`` when the
        PDF path does not exist or does not end in ``.pdf``.
    """
    pdf_file = input("请输入PDF文件路径：").strip()
    is_existing_pdf = os.path.exists(pdf_file) and pdf_file.lower().endswith('.pdf')
    if not is_existing_pdf:
        logging.error("无效的PDF文件路径。请重新输入有效路径。")
        return None, None

    output_txt = input("请输入输出文本文件路径：").strip()
    output_txt = output_txt if output_txt.endswith('.txt') else output_txt + '.txt'

    # An empty dirname means "current directory": nothing to create.
    output_dir = os.path.dirname(output_txt)
    if output_dir != '' and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    return pdf_file, output_txt

# 启动程序
def main():
    """Prompt for paths and OCR language, then run the extraction."""
    pdf_file, output_txt = get_file_paths_from_user()
    if not (pdf_file and output_txt):
        return

    # An empty answer falls back to the 'ch' model.
    ocr_lang = input("请输入OCR语言代码（默认 'ch'）：").strip() or 'ch'
    extract_text_from_pdf(pdf_file, output_txt, ocr_lang=ocr_lang)


if __name__ == "__main__":
    main()

优化点详细说明

共享PaddleOCR实例:
- 将PaddleOCR实例在主线程中创建，并作为参数传递给每个线程。这避免了每个线程重复创建OCR实例，节省资源。需要注意：PaddleOCR 的推理实例和 PyMuPDF 的文档对象都不保证线程安全，多个线程共用同一实例时建议加锁串行调用，或改用多进程处理。
线程安全的结果存储:
- 使用一个共享的字典results和线程锁lock来存储每个页面的提取结果，确保多线程环境下的数据一致性。
按页码排序写入:
- 所有页面处理完成后，按照页码顺序将结果写入输出文件，保持文本的逻辑顺序。
限制线程数:
- 将最大线程数限制为min(int(os.cpu_count() * 1.5), total_pages)，避免创建超过总页数的线程，提高资源利用率。
增强的日志记录:
- 增加了更多的日志信息，例如提交任务时的日志，处理完成后的日志等，便于监控处理进度。
支持选择OCR语言:
- 添加了用户输入OCR语言代码的选项，增强了代码的灵活性。
错误处理优化:
- 在主线程中捕获线程异常，避免某个线程的错误影响整个程序的运行。
改进图像预处理:
- 根据具体需求，可以进一步优化图像预处理步骤。例如，调整二值化阈值、添加图像去噪等。
代码组织优化:
- 通过参数传递和函数分工，使代码结构更加清晰，易于维护和扩展。

进一步的优化建议

使用异步IO:
- 对于I/O密集型任务，可以考虑使用asyncio来进一步提升性能。
缓存机制:
- 如果需要处理大量相似的图像，可以引入缓存机制，避免重复OCR。
多语言支持:
- 可以扩展支持多种语言的OCR，根据页面内容自动切换语言模型。
图形用户界面（GUI）:
- 为非技术用户提供更友好的操作界面，例如使用tkinter或PyQt创建简单的GUI。
进度条显示:
- 使用tqdm等库显示处理进度，提升用户体验。
配置文件:
- 将一些可配置参数（如OCR语言、线程数等）移到配置文件中，增加灵活性。

希望这些改进和优化能帮助您更高效地处理PDF文件。如有进一步的问题或需要更多功能的实现，欢迎继续交流！

Answer 2

完整的中英双语PDF OCR处理工具

为了满足您的需求，我将提供一个完整的命令行工具，用于处理PDF文件，提取文本和对图像进行OCR识别，支持中英文。以下内容包括：

所需模块及安装指南
完整的Python脚本
使用说明

1. 所需模块及安装指南

在开始之前，请确保您已经安装了以下Python模块。如果尚未安装，可以使用pip进行安装。

必要的Python模块

PyMuPDF (fitz): 用于处理PDF文件。
PaddleOCR: 用于OCR识别。
Pillow (PIL): 用于图像处理。
OpenCV (cv2): 用于图像预处理。
numpy: 用于数值计算。
tqdm: 用于显示进度条。
argparse: 用于解析命令行参数。
concurrent.futures: 用于多线程处理。
logging: 用于日志记录。

安装命令

打开终端或命令提示符，运行以下命令安装所需模块：

bash
pip install PyMuPDF paddleocr pillow opencv-python numpy tqdm

注意：argparse, concurrent.futures, 和 logging 是Python的标准库模块，无需额外安装。

2. 完整的Python脚本

以下是优化后的完整Python脚本，支持中英双语OCR，并通过命令行接口运行。

python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import fitz  # PyMuPDF
from paddleocr import PaddleOCR
from PIL import Image
import io
import traceback
import numpy as np
import re
import cv2
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import logging
import threading
import argparse
from tqdm import tqdm

# 配置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 清理文本中的噪声字符
def clean_text(text):
    """Reduce extracted text to space-separated letters, digits and CJK ideographs.

    Drops ``--- Page N ---`` / ``--- OCR Text from Image N ---`` markers, then
    turns every run of characters outside ``A-Za-z0-9`` and U+4E00..U+9FA5
    into a single space and trims the ends.
    """
    without_markers = re.sub(r'--- (Page \d+|OCR Text from Image \d+) ---', '', text)
    tokens = re.sub(r'[^A-Za-z0-9\u4e00-\u9fa5]+', ' ', without_markers).split()
    return ' '.join(tokens)

# 提取PDF页面文本
def extract_text_from_page(page_num, doc, ocr, lock, results):
    """Collect the text of one PDF page, including OCR text from its embedded images.

    The page's text layer is cleaned with ``clean_text``; every embedded image
    is converted to grayscale, binarized at a fixed threshold of 150 and passed
    to ``ocr``. The combined text is stored in ``results[page_num]``.

    Args:
        page_num: Zero-based page index.
        doc: Open PyMuPDF document the page is loaded from.
        ocr: PaddleOCR instance used for recognition.
        lock: Lock held while writing to ``results``.
        results: Dict that receives ``page_num -> extracted text``.

    Returns:
        None. Failures are logged instead of raised: a bad image is skipped,
        a failed page leaves no entry in ``results``.
    """
    try:
        page = doc.load_page(page_num)
        text_content = clean_text(page.get_text("text"))

        for img_index, img in enumerate(page.get_images(full=True)):
            try:
                base_image = doc.extract_image(img[0])  # img[0] is the image xref
                pil_image = Image.open(io.BytesIO(base_image["image"]))

                # Let Pillow do the grayscale conversion: it copes with RGB, RGBA,
                # palette and already-gray images, whereas the cv2.cvtColor chain
                # raises on 2-D input and mis-weights the channels of RGBA input.
                gray_image = np.array(pil_image.convert("L"))
                _, binarized_image = cv2.threshold(gray_image, 150, 255, cv2.THRESH_BINARY)

                result = ocr.ocr(binarized_image, rec=True, cls=True)
                # The result holds one entry per input image, and that entry is
                # None/empty when no text was detected (PaddleOCR 2.6+ layout --
                # confirm against the installed version). Iterating ``result``
                # itself would walk over images, not text lines.
                detections = result[0] if result else None
                if not detections:
                    continue

                ocr_text = clean_text(
                    "\n".join(line[1][0] for line in detections if line[1][0].strip())
                )
                if ocr_text:
                    text_content += f"\n--- OCR Text from Image {img_index + 1} ---\n{ocr_text}"

            except Exception as img_error:
                logging.warning(f"Error processing image {img_index + 1} on page {page_num + 1}: {img_error}")
                logging.debug(traceback.format_exc())

        # Publish the page text under the shared lock.
        with lock:
            results[page_num] = text_content

    except Exception as e:
        logging.error(f"Error processing page {page_num + 1}: {e}")
        logging.debug(traceback.format_exc())

# 提取整个PDF的文本，支持多线程
def extract_text_from_pdf(pdf_file, output_txt, ocr_lang='ch'):
    """Extract text (text layer + image OCR) from every page of a PDF into a text file.

    Pages are handed to a thread pool while a tqdm bar tracks completion; the
    output is written in page order, each page under a ``--- Page N ---`` header.

    Args:
        pdf_file: Path of the PDF to read.
        output_txt: Path of the UTF-8 text file to (over)write.
        ocr_lang: PaddleOCR language code, ``'ch'`` by default.

    Returns:
        None. Any failure is logged rather than raised.
    """
    try:
        # The context manager closes the document even if processing fails.
        with fitz.open(pdf_file) as doc:
            total_pages = doc.page_count
            # os.cpu_count() may return None, and ThreadPoolExecutor rejects 0
            # workers (which a PDF without pages would otherwise request).
            max_threads = max(1, min(int((os.cpu_count() or 1) * 1.5), total_pages))

            # Truncate up front so a failed run never leaves stale output behind.
            with open(output_txt, "w", encoding="utf-8"):
                pass

            ocr = PaddleOCR(use_angle_cls=True, lang=ocr_lang)

            # NOTE(review): one document and one OCR engine are shared by all
            # worker threads; PyMuPDF's docs advise against multithreaded use --
            # confirm this is safe or process pages sequentially.
            results = {}
            lock = threading.Lock()

            with ThreadPoolExecutor(max_threads) as executor:
                futures = [
                    executor.submit(extract_text_from_page, page_num, doc, ocr, lock, results)
                    for page_num in range(total_pages)
                ]
                # Drain the futures only to drive the progress bar; the workers
                # log their own errors.
                for _ in tqdm(as_completed(futures), total=total_pages, desc="Processing Pages"):
                    pass

        # Pages that failed have no entry and get an empty section.
        with open(output_txt, "a", encoding="utf-8") as txt_file:
            for page_num in range(total_pages):
                txt_file.write(f"--- Page {page_num + 1} ---\n")
                txt_file.write(results.get(page_num, ''))
                txt_file.write("\n\n")

        logging.info("Processing complete!")

    except Exception as e:
        logging.error(f"Error processing PDF file {pdf_file}: {e}")
        logging.debug(traceback.format_exc())

# 解析命令行参数
def parse_arguments():
    """Parse the command line of the OCR tool.

    Returns:
        An ``argparse.Namespace`` with ``pdf_file``, ``output_txt`` and
        ``lang`` (one of ``ch``, ``en``, ``ch_en``; default ``ch``).
    """
    cli = argparse.ArgumentParser(description="PDF OCR处理工具，支持中英文。")

    # The two required positional paths share the same shape.
    for positional, help_text in (
        ('pdf_file', '输入的PDF文件路径'),
        ('output_txt', '输出的文本文件路径'),
    ):
        cli.add_argument(positional, type=str, help=help_text)

    cli.add_argument(
        '--lang',
        type=str,
        default='ch',
        choices=['ch', 'en', 'ch_en'],
        help='OCR语言，支持中英文（ch_en），默认中英文（ch）',
    )
    return cli.parse_args()

# 启动程序
def main():
    """Command-line entry point: validate the paths, then run the extraction.

    Exits with status 1 if the input is not an existing ``.pdf`` file or the
    output directory cannot be created.
    """
    args = parse_arguments()
    pdf_file, output_txt, ocr_lang = args.pdf_file, args.output_txt, args.lang

    # Input must exist and carry a .pdf extension (case-insensitive).
    input_ok = os.path.exists(pdf_file) and pdf_file.lower().endswith('.pdf')
    if not input_ok:
        logging.error("无效的PDF文件路径。请提供一个存在的PDF文件。")
        sys.exit(1)

    # Append the .txt suffix when the caller left it off.
    if not output_txt.lower().endswith('.txt'):
        output_txt = output_txt + '.txt'

    # Create the output directory on demand.
    output_dir = os.path.dirname(output_txt)
    if output_dir and not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
            logging.info(f"已创建输出目录: {output_dir}")
        except Exception as e:
            logging.error(f"无法创建输出目录 {output_dir}: {e}")
            sys.exit(1)

    # 'ch_en' is an alias that is mapped onto the 'ch' model.
    if ocr_lang == 'ch_en':
        ocr_lang = 'ch'
        logging.info("OCR语言设置为中英混合识别。")

    logging.info(f"开始处理PDF文件: {pdf_file}")
    extract_text_from_pdf(pdf_file, output_txt, ocr_lang=ocr_lang)


if __name__ == "__main__":
    main()

3. 使用说明

3.1. 脚本保存与权限

将上述脚本保存为 pdf_ocr.py。为了在命令行中直接运行脚本，您可以为脚本添加执行权限（适用于Unix/Linux系统）：

bash
chmod +x pdf_ocr.py

3.2. 运行脚本

打开终端或命令提示符，导航到脚本所在的目录，运行以下命令：

bash
python pdf_ocr.py 输入的PDF文件路径 输出的文本文件路径 [--lang 语言代码]

参数说明：

输入的PDF文件路径：要处理的PDF文件的完整路径。
输出的文本文件路径：处理后的文本文件的保存路径。
--lang：可选参数，指定OCR识别的语言。支持：
- ch：中文（默认）
- en：英文
- ch_en：中英文混合

示例：

处理中英文混合的PDF并输出到指定文本文件：

bash
python pdf_ocr.py /path/to/input.pdf /path/to/output.txt --lang ch_en

只处理中文的PDF：

bash
python pdf_ocr.py input.pdf output.txt --lang ch

只处理英文的PDF：

bash
python pdf_ocr.py input.pdf output.txt --lang en

3.3. 脚本运行过程

初始化：
- 验证输入的PDF文件路径是否存在且为PDF格式。
- 检查并创建输出文件的父目录（如果不存在）。
OCR处理：
- 使用多线程加速处理，每个线程处理一个页面。
- 对每个页面：
  - 提取文本内容并清理噪声字符。
  - 提取页面中的所有图像，进行预处理（灰度化、二值化）。
  - 使用PaddleOCR对预处理后的图像进行OCR识别，提取文本。
  - 将提取的文本和OCR结果整合。
结果输出：
- 所有页面处理完成后，按页码顺序将结果写入输出文本文件。
- 在命令行中显示处理进度。
完成：
- 处理完成后，日志会显示“Processing complete!”。

3.4. 其他注意事项

PaddleOCR模型下载：
- 第一次运行PaddleOCR时，会自动下载所需的模型文件，请确保您的网络连接正常。
性能优化：
- 脚本自动根据CPU核心数调整线程池大小，您可以根据需要调整代码中的max_threads计算方式以优化性能。
日志信息：
- 脚本使用logging模块记录信息、警告和错误，便于调试和监控处理过程。
错误处理：
- 脚本包含详细的异常处理机制，确保在处理过程中遇到错误时不会中断整个流程，并记录相关错误信息。

4. 示例

假设您有一个名为 sample.pdf 的PDF文件，位于 /Users/username/Documents/ 目录，您希望将处理后的文本保存为 output.txt，可以运行以下命令：

bash
python pdf_ocr.py /Users/username/Documents/sample.pdf /Users/username/Documents/output.txt --lang ch_en

在终端中，您将看到类似以下的输出：

text
2024-04-27 12:00:00,000 - INFO - 开始处理PDF文件: /Users/username/Documents/sample.pdf
Processing Pages: 100%|████████████████████████████████████████| 200/200 [10:00<00:00,  3.33it/s]
2024-04-27 12:10:00,000 - INFO - Processing complete!

处理完成后，output.txt 文件将包含提取的文本和OCR识别的图像文本，按页码顺序排列。

5. 结语

通过以上步骤，您可以使用提供的Python脚本高效地处理PDF文件，提取文本并进行中英文OCR识别。脚本利用多线程和图像预处理技术，确保在处理复杂PDF时具有良好的性能和准确性。如果您有进一步的需求或问题，欢迎继续交流！

Answer 3

您好！

您在运行 pdf2.py 脚本时遇到了与 protobuf 版本相关的错误。具体错误信息如下：

text
TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).
More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

问题分析

该错误提示表明当前安装的 protobuf 版本与 PaddleOCR 的依赖库（例如 paddlepaddle）不兼容。具体来说，这些库自带的 _pb2.py 文件是用旧版 protoc 生成的，而 protobuf 4.x 不再允许以这种方式直接创建描述符，因此在导入这些文件时会抛出上述 TypeError。

解决方案

根据错误信息的建议，您有以下几种解决方案：

降级 protobuf 到 3.20.x 或更低版本。
设置环境变量 PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python。

通常，第一种方法更为推荐，因为它能够保持更好的性能和兼容性。以下是详细的步骤：

步骤 1：检查当前 `protobuf` 版本

首先，检查您当前安装的 protobuf 版本，以确认是否需要降级。

bash
pip show protobuf

输出示例：

text
Name: protobuf
Version: 4.22.3
...

如果版本号高于 3.20.x，则需要降级。

步骤 2：降级 `protobuf` 到 3.20.x

使用以下命令将 protobuf 降级到 3.20.3（这是一个稳定且广泛使用的版本）：

bash
pip install protobuf==3.20.3

如果您使用的是 conda 环境，也可以使用 conda 进行安装：

bash
conda install protobuf=3.20.3

步骤 3：验证安装

确保 protobuf 已成功降级到 3.20.3：

bash
pip show protobuf

应输出：

text
Name: protobuf
Version: 3.20.3
...

步骤 4：重新运行脚本

现在，尝试重新运行您的 pdf2.py 脚本：

bash
python pdf2.py

可选步骤：创建虚拟环境

为了避免未来出现类似的依赖冲突，建议您为项目创建一个独立的虚拟环境。以下是使用 venv 创建和激活虚拟环境的步骤：

创建虚拟环境
```bash
python -m venv pdf_ocr_env
```

激活虚拟环境

Windows:
```bash
pdf_ocr_env\Scripts\activate
```
macOS/Linux:
```bash
source pdf_ocr_env/bin/activate
```

在虚拟环境中安装依赖

bash
pip install --upgrade pip
pip install PyMuPDF paddleocr pillow opencv-python numpy tqdm protobuf==3.20.3

运行脚本
```bash
python pdf2.py
```

其他解决方案

如果您不希望降级 protobuf，还可以尝试以下方法之一：

方法 1：设置环境变量

将环境变量 PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION 设置为 python。这将使用纯 Python 实现，虽然速度较慢，但可以避免版本冲突。

临时设置（仅当前终端有效）：

Windows:

bash
set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

macOS/Linux:

bash
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

永久设置：

根据操作系统的不同，可以将上述命令添加到启动脚本（如 .bashrc、.bash_profile 或 Windows 的系统环境变量）中。

然后，重新运行脚本：

bash
python pdf2.py

方法 2：升级相关依赖

检查是否有更新版本的 PaddleOCR 或其依赖库，可能已经解决了与 protobuf 的兼容性问题。您可以尝试升级 PaddleOCR：

bash
pip install --upgrade paddleocr

总结

最直接和推荐的解决方案是 降级 protobuf 到 3.20.3。这可以确保与 PaddleOCR 及其依赖库的兼容性，并避免出现上述错误。

如果您在执行这些步骤后仍遇到问题，请提供更多的错误信息，以便进一步协助您解决问题。

祝您成功！

Answer 4

您好！

针对您遇到的 protobuf 版本兼容性问题，以下是降级 protobuf 到 3.20.3 版本的具体命令和步骤：

1. 使用 `pip` 降级 `protobuf`

如果您使用的是 pip 作为包管理工具，可以按照以下步骤操作：

步骤 1：卸载当前版本的 `protobuf`

首先，卸载现有的 protobuf 版本，以确保降级顺利进行。

bash
pip uninstall protobuf

您可能需要确认卸载操作，输入 y 并按下回车键。

步骤 2：安装指定版本的 `protobuf`

安装 protobuf 的 3.20.3 版本：

bash
pip install protobuf==3.20.3

2. 使用 `conda` 降级 `protobuf`（如果您使用的是 Anaconda）

如果您使用的是 conda 作为包管理工具，可以按照以下步骤操作：

bash
conda install protobuf=3.20.3

3. 验证安装的 `protobuf` 版本

安装完成后，您可以通过以下命令验证 protobuf 的版本，确保已成功降级：

bash
pip show protobuf

预期输出应类似如下，其中 Version 显示为 3.20.3：

text
Name: protobuf
Version: 3.20.3
...

4. 推荐：使用虚拟环境

为了避免未来可能出现的依赖冲突，建议您为项目创建一个独立的虚拟环境。以下是使用 venv 创建和激活虚拟环境的步骤：

步骤 1：创建虚拟环境

bash
python -m venv pdf_ocr_env

步骤 2：激活虚拟环境

Windows：
```bash
pdf_ocr_env\Scripts\activate
```
macOS/Linux：
```bash
source pdf_ocr_env/bin/activate
```

激活后，您的命令提示符前会出现 (pdf_ocr_env)，表示当前处于虚拟环境中。

步骤 3：在虚拟环境中安装所需依赖

bash
pip install --upgrade pip
pip install PyMuPDF paddleocr pillow opencv-python numpy tqdm protobuf==3.20.3

5. 重新运行脚本

在完成上述步骤后，尝试重新运行您的 pdf2.py 脚本：

bash
python pdf2.py

6. 其他解决方案（备用）

如果您不希望降级 protobuf，还可以尝试以下方法之一：

方法 1：设置环境变量

将环境变量 PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION 设置为 python。这将使用纯 Python 实现，虽然速度较慢，但可以避免版本冲突。

临时设置（仅当前终端有效）：

Windows：

bash
set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

macOS/Linux：

bash
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

永久设置：

根据操作系统的不同，可以将上述命令添加到启动脚本（如 .bashrc、.bash_profile 或 Windows 的系统环境变量）中。

设置完成后，重新运行脚本：

bash
python pdf2.py

方法 2：升级相关依赖

检查是否有更新版本的 PaddleOCR 或其依赖库，可能已经解决了与 protobuf 的兼容性问题。您可以尝试升级 PaddleOCR：

bash
pip install --upgrade paddleocr

总结

最直接和推荐的解决方案是 降级 protobuf 到 3.20.3。这可以确保与 PaddleOCR 及其依赖库的兼容性，并避免出现您当前遇到的错误。

如果您在执行这些步骤后仍遇到问题，请提供更多的错误信息，以便进一步协助您解决问题。

祝您成功！

Answer 5

您好！

您在运行脚本时遇到了以下警告：

text
WARNING:root:Error processing image 3 on page 3: 'NoneType' object is not subscriptable
WARNING:root:Error processing image 3 on page 5: 'NoneType' object is not subscriptable

问题分析

这个错误发生在对 None 对象进行下标操作时。在您的代码中，最可能的位置是处理 OCR 结果的那一行：在较新版本的 PaddleOCR 中，ocr.ocr 按输入图像返回一个列表，当某张图像没有检测到任何文字时，对应的元素是 None（即返回 [None]）。此时如果直接遍历 result，取到的 line 就是 None，再执行 line[1][0] 便会抛出该异常。

可能的原因

OCR 识别失败：某些图像可能无法被正确识别，导致 ocr.ocr 返回 None 或空结果。
图像预处理问题：图像预处理步骤可能导致图像质量下降，使得 OCR 无法正确处理。
图像提取问题：提取的图像可能为 None 或损坏，导致后续处理失败。

解决方案

1. 增强错误处理

在处理 OCR 结果时，添加更多的检查以确保结果有效。例如，检查 result 是否为 None，以及每个识别结果的结构是否符合预期。

2. 调整图像预处理

优化图像预处理步骤，确保图像质量适合 OCR 识别。可以尝试不同的二值化阈值或其他图像增强技术。

3. 日志详细信息

记录更多的日志信息，帮助您了解哪些图像处理失败以及失败的具体原因。

具体实现

以下是对您的脚本进行改进的建议和代码示例：

python
import sys
import fitz  # PyMuPDF
from paddleocr import PaddleOCR
from PIL import Image
import io
import traceback
import numpy as np
import re
import cv2
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import logging
import threading
import argparse
from tqdm import tqdm

# 配置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 清理文本中的噪声字符
def clean_text(text):
    """Reduce extracted text to space-separated letters, digits and CJK ideographs.

    Drops ``--- Page N ---`` / ``--- OCR Text from Image N ---`` markers, then
    turns every run of characters outside ``A-Za-z0-9`` and U+4E00..U+9FA5
    into a single space and trims the ends.
    """
    without_markers = re.sub(r'--- (Page \d+|OCR Text from Image \d+) ---', '', text)
    tokens = re.sub(r'[^A-Za-z0-9\u4e00-\u9fa5]+', ' ', without_markers).split()
    return ' '.join(tokens)

# 提取PDF页面文本
def extract_text_from_page(page_num, doc, ocr, lock, results):
    """Collect the text of one PDF page, including OCR text from its embedded images.

    The page's text layer is cleaned with ``clean_text``; every embedded image
    is converted to grayscale, binarized at a fixed threshold of 150 and passed
    to ``ocr``. The combined text is stored in ``results[page_num]``.

    Args:
        page_num: Zero-based page index.
        doc: Open PyMuPDF document the page is loaded from.
        ocr: PaddleOCR instance used for recognition.
        lock: Lock held while writing to ``results``.
        results: Dict that receives ``page_num -> extracted text``.

    Returns:
        None. Failures are logged instead of raised: a bad image is skipped,
        a failed page leaves no entry in ``results``.
    """
    try:
        page = doc.load_page(page_num)
        text_content = clean_text(page.get_text("text"))

        for img_index, img in enumerate(page.get_images(full=True)):
            try:
                base_image = doc.extract_image(img[0])  # img[0] is the image xref
                image_bytes = base_image.get("image")
                if image_bytes is None:
                    logging.warning(f"Image bytes is None for image {img_index + 1} on page {page_num + 1}")
                    continue
                pil_image = Image.open(io.BytesIO(image_bytes))

                # Let Pillow do the grayscale conversion: it copes with RGB, RGBA,
                # palette and already-gray images, whereas the cv2.cvtColor chain
                # raises on 2-D input and mis-weights the channels of RGBA input.
                gray_image = np.array(pil_image.convert("L"))
                _, binarized_image = cv2.threshold(gray_image, 150, 255, cv2.THRESH_BINARY)

                result = ocr.ocr(binarized_image, rec=True, cls=True)
                # The result holds one entry per input image, and that entry is
                # None/empty when no text was detected (PaddleOCR 2.6+ layout --
                # confirm against the installed version). Iterating ``result``
                # itself yields that None, which is what raised
                # "'NoneType' object is not subscriptable".
                detections = result[0] if result else None
                if not detections:
                    logging.warning(f"OCR result is empty or invalid for image {img_index + 1} on page {page_num + 1}")
                    continue

                ocr_text = clean_text(
                    "\n".join(line[1][0] for line in detections if line[1][0].strip())
                )
                if ocr_text:
                    text_content += f"\n--- OCR Text from Image {img_index + 1} ---\n{ocr_text}"

            except Exception as img_error:
                logging.warning(f"Error processing image {img_index + 1} on page {page_num + 1}: {img_error}")
                logging.debug(traceback.format_exc())

        # Publish the page text under the shared lock.
        with lock:
            results[page_num] = text_content

    except Exception as e:
        logging.error(f"Error processing page {page_num + 1}: {e}")
        logging.debug(traceback.format_exc())

# 提取整个PDF的文本，支持多线程
def extract_text_from_pdf(pdf_file, output_txt, ocr_lang='ch'):
    """Extract text (text layer + image OCR) from every page of a PDF into a text file.

    Pages are handed to a thread pool while a tqdm bar tracks completion; the
    output is written in page order, each page under a ``--- Page N ---`` header.

    Args:
        pdf_file: Path of the PDF to read.
        output_txt: Path of the UTF-8 text file to (over)write.
        ocr_lang: PaddleOCR language code, ``'ch'`` by default.

    Returns:
        None. Any failure is logged rather than raised.
    """
    try:
        # The context manager closes the document even if processing fails.
        with fitz.open(pdf_file) as doc:
            total_pages = doc.page_count
            # os.cpu_count() may return None, and ThreadPoolExecutor rejects 0
            # workers (which a PDF without pages would otherwise request).
            max_threads = max(1, min(int((os.cpu_count() or 1) * 1.5), total_pages))

            # Truncate up front so a failed run never leaves stale output behind.
            with open(output_txt, "w", encoding="utf-8"):
                pass

            ocr = PaddleOCR(use_angle_cls=True, lang=ocr_lang)

            # NOTE(review): one document and one OCR engine are shared by all
            # worker threads; PyMuPDF's docs advise against multithreaded use --
            # confirm this is safe or process pages sequentially.
            results = {}
            lock = threading.Lock()

            with ThreadPoolExecutor(max_threads) as executor:
                futures = [
                    executor.submit(extract_text_from_page, page_num, doc, ocr, lock, results)
                    for page_num in range(total_pages)
                ]
                # Drain the futures only to drive the progress bar; the workers
                # log their own errors.
                for _ in tqdm(as_completed(futures), total=total_pages, desc="Processing Pages"):
                    pass

        # Pages that failed have no entry and get an empty section.
        with open(output_txt, "a", encoding="utf-8") as txt_file:
            for page_num in range(total_pages):
                txt_file.write(f"--- Page {page_num + 1} ---\n")
                txt_file.write(results.get(page_num, ''))
                txt_file.write("\n\n")

        logging.info("Processing complete!")

    except Exception as e:
        logging.error(f"Error processing PDF file {pdf_file}: {e}")
        logging.debug(traceback.format_exc())

# 解析命令行参数
def parse_arguments():
    """Parse the command line of the OCR tool.

    Returns:
        An ``argparse.Namespace`` with ``pdf_file``, ``output_txt`` and
        ``lang`` (one of ``ch``, ``en``, ``ch_en``; default ``ch``).
    """
    cli = argparse.ArgumentParser(description="PDF OCR处理工具，支持中英文。")

    # The two required positional paths share the same shape.
    for positional, help_text in (
        ('pdf_file', '输入的PDF文件路径'),
        ('output_txt', '输出的文本文件路径'),
    ):
        cli.add_argument(positional, type=str, help=help_text)

    cli.add_argument(
        '--lang',
        type=str,
        default='ch',
        choices=['ch', 'en', 'ch_en'],
        help='OCR语言，支持中英文（ch_en），默认中英文（ch）',
    )
    return cli.parse_args()

# 启动程序
def main():
    """Command-line entry point: validate the paths, then run the extraction.

    Exits with status 1 if the input is not an existing ``.pdf`` file or the
    output directory cannot be created.
    """
    args = parse_arguments()
    pdf_file, output_txt, ocr_lang = args.pdf_file, args.output_txt, args.lang

    # Input must exist and carry a .pdf extension (case-insensitive).
    input_ok = os.path.exists(pdf_file) and pdf_file.lower().endswith('.pdf')
    if not input_ok:
        logging.error("无效的PDF文件路径。请提供一个存在的PDF文件。")
        sys.exit(1)

    # Append the .txt suffix when the caller left it off.
    if not output_txt.lower().endswith('.txt'):
        output_txt = output_txt + '.txt'

    # Create the output directory on demand.
    output_dir = os.path.dirname(output_txt)
    if output_dir and not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
            logging.info(f"已创建输出目录: {output_dir}")
        except Exception as e:
            logging.error(f"无法创建输出目录 {output_dir}: {e}")
            sys.exit(1)

    # 'ch_en' is an alias that is mapped onto the 'ch' model.
    if ocr_lang == 'ch_en':
        ocr_lang = 'ch'
        logging.info("OCR语言设置为中英混合识别。")

    logging.info(f"开始处理PDF文件: {pdf_file}")
    extract_text_from_pdf(pdf_file, output_txt, ocr_lang=ocr_lang)


if __name__ == "__main__":
    main()

改进点详细说明

增加对 image_bytes 的检查：在提取图像后，检查 image_bytes 是否为 None。如果为 None，记录警告并跳过该图像。

python
# Excerpt from the per-image loop: .get() yields None instead of raising
# KeyError when the "image" key is absent (assumes base_image is a dict).
image_bytes = base_image.get("image")
if image_bytes is None:
    logging.warning(f"Image bytes is None for image {img_index + 1} on page {page_num + 1}")
    continue  # skip this image

验证 OCR 结果的有效性：在处理 OCR 结果前，确保 result 不为 None，并且是一个列表。同时，确保每个识别结果的结构符合预期。

python
# Excerpt from the per-image loop. The OCR result holds one entry per input
# image, and that entry is None/empty when no text was detected (PaddleOCR
# 2.6+ layout -- confirm against the installed version), so the text lines
# live in result[0], not in result itself.
detections = result[0] if result else None
if detections:
    ocr_text = "\n".join(line[1][0] for line in detections if line[1][0].strip())
    ocr_text = clean_text(ocr_text)
    if ocr_text:
        text_content += f"\n--- OCR Text from Image {img_index + 1} ---\n{ocr_text}"
else:
    logging.warning(f"OCR result is empty or invalid for image {img_index + 1} on page {page_num + 1}")

增强日志记录：记录更多的详细信息，帮助您排查问题。例如，记录哪些图像处理失败，以及失败的具体原因。
图像预处理优化：根据需要，您可以进一步优化图像预处理步骤。例如，调整二值化阈值、应用图像去噪等，以提高 OCR 的准确性。

进一步的建议

验证图像是否包含文本：在进行 OCR 之前，您可以使用一些图像分析技术来判断图像中是否包含文本，从而避免不必要的 OCR 处理。
限制 OCR 识别的区域：如果您知道文本通常位于图像的特定区域，可以裁剪图像以仅包含这些区域，减少 OCR 的负担并提高准确性。
使用更强大的图像增强技术：根据具体的 PDF 文件特点，尝试使用不同的图像增强技术，如自适应阈值、对比度调整等，以改善 OCR 的效果。
捕获并处理更多异常：在 OCR 过程中，捕获并处理更多可能的异常，以确保脚本在遇到意外情况时不会中断。

示例运行

假设您已经保存了改进后的脚本为 pdf_ocr.py，可以通过以下命令运行：

bash
python pdf_ocr.py /path/to/input.pdf /path/to/output.txt --lang ch_en

在终端中，您将看到处理进度和日志信息。如果某些图像处理失败，会记录相应的警告信息，帮助您定位问题。

总结

通过增强错误处理和日志记录，您可以更好地理解和解决当前遇到的问题。同时，优化图像预处理步骤可以提高 OCR 的准确性和稳定性。如果问题依然存在，请提供更多的日志信息，以便进一步协助您解决问题。

祝您成功！

Вопрос

配置日志

清理文本中的噪声字符

提取PDF页面文本

提取整个PDF的文本，支持多线程

命令行让用户输入文件路径

启动程序

Мысль

Ответ

改进建议

优化后的代码示例

优化点详细说明

进一步的优化建议

Вопрос

Мысль

Ответ

完整的中英双语PDF OCR处理工具

1. 所需模块及安装指南

必要的Python模块

安装命令

2. 完整的Python脚本

3. 使用说明

3.1. 脚本保存与权限

3.2. 运行脚本

3.3. 脚本运行过程

3.4. 其他注意事项

4. 示例

5. 结语

Вопрос

Мысль

Ответ

问题分析

解决方案

步骤 1：检查当前 protobuf 版本

步骤 2：降级 protobuf 到 3.20.x

步骤 3：验证安装

步骤 4：重新运行脚本

可选步骤：创建虚拟环境

其他解决方案

方法 1：设置环境变量

方法 2：升级相关依赖

总结

Вопрос

Мысль

Ответ

1. 使用 pip 降级 protobuf

步骤 1：卸载当前版本的 protobuf

步骤 2：安装指定版本的 protobuf

2. 使用 conda 降级 protobuf（如果您使用的是 Anaconda）

3. 验证安装的 protobuf 版本

4. 推荐：使用虚拟环境

步骤 1：创建虚拟环境

步骤 2：激活虚拟环境

步骤 3：在虚拟环境中安装所需依赖

5. 重新运行脚本

6. 其他解决方案（备用）

方法 1：设置环境变量

方法 2：升级相关依赖

总结

Вопрос

Мысль

Ответ

问题分析

可能的原因

解决方案

1. 增强错误处理

2. 调整图像预处理

3. 日志详细信息

具体实现

改进点详细说明

进一步的建议

示例运行

总结

Поделиться этим Q&A

步骤 1：检查当前 `protobuf` 版本

步骤 2：降级 `protobuf` 到 3.20.x

1. 使用 `pip` 降级 `protobuf`

步骤 1：卸载当前版本的 `protobuf`

步骤 2：安装指定版本的 `protobuf`

2. 使用 `conda` 降级 `protobuf`（如果您使用的是 Anaconda）

3. 验证安装的 `protobuf` 版本