Python 解析 PDF 中的图片和文字

guanchaoguo / 2023-09-01 / 原文

利用 pdfminer.six 和 fitz(PyMuPDF)

import os
from pathlib import Path

from fitz import fitz
from pdfminer.high_level import extract_text


def mu_extract_images(file_name, output_file):
    """Extract the images embedded in a PDF, one output file per image.

    Images are written to ``<output_file>/<stem of file_name>/`` and named
    ``<page number>-<xref>.<ext>``, where the page number starts at 1 and
    ``<ext>`` is the extension reported by ``doc.extract_image``.  Files
    left in that directory by a previous run are deleted first.

    Extraction is best-effort: a page or image that cannot be processed is
    reported on stdout and skipped instead of aborting the whole run.

    Args:
        file_name: Path of the PDF document to read.
        output_file: Root output *directory* (despite the name); a
            sub-directory named after the PDF's stem is created inside it.
    """
    doc = fitz.open(file_name)
    try:
        xref_length = doc.xref_length()
        print("文件名:{}, 页数: {}, 对象: {}".format(file_name, len(doc), xref_length - 1))

        pic_path = Path(output_file) / Path(file_name).stem
        pic_path.mkdir(parents=True, exist_ok=True)

        # Clear stale output without going through a shell: the path is
        # caller-supplied, so building an `rm` command line from it would be
        # both injectable and non-portable.  Like `rm dir/*`, this leaves
        # dot-files and sub-directories alone.
        for stale in pic_path.iterdir():
            if stale.name.startswith("."):
                continue
            if stale.is_file() or stale.is_symlink():
                stale.unlink()

        for page_number, page in enumerate(doc, start=1):
            try:
                page_images = page.get_images()
            except Exception as exc:
                print("skipping page {}: {!r}".format(page_number, exc))
                continue

            for image_info in page_images:
                # The first field of each entry is the image's xref number.
                xref = image_info[0]
                try:
                    img = doc.extract_image(xref)
                    image_name = "%s-%s.%s" % (page_number, xref, img["ext"])
                    image_filename = pic_path / image_name
                    print(image_filename)
                    with open(image_filename, 'wb') as img_out:
                        img_out.write(img["image"])
                except Exception as exc:
                    # Keep going: one broken image should not cost us the
                    # remaining images of the page or document.
                    print("skipping image xref {} on page {}: {!r}".format(
                        xref, page_number, exc))
    finally:
        doc.close()


if __name__ == '__main__':
    # Text extraction with pdfminer is currently switched off; only the
    # image extraction below runs.
    # with open("out/3333.txt", 'w') as file:
    #     file.write(extract_text('3333.pdf'))
    mu_extract_images(file_name="3333.pdf", output_file="out")