# Uses pdfminer.six and fitz (PyMuPDF).
import os
from pathlib import Path
from fitz import fitz
from pdfminer.high_level import extract_text
def _remove_stale_files(directory):
    """Delete the visible, non-directory entries directly inside *directory*.

    Mirrors what a shell ``rm directory/*`` removes: sub-directories and
    dot-files are left alone. A file that cannot be removed is reported
    and skipped rather than aborting the run.
    """
    for entry in directory.iterdir():
        is_real_dir = entry.is_dir() and not entry.is_symlink()
        if entry.name.startswith(".") or is_real_dir:
            continue
        try:
            entry.unlink()
        except OSError as err:
            print("could not remove {}: {}".format(entry, err))


def mu_extract_images(file_name, output_file):
    """Save the images referenced by each page of a PDF to disk.

    Images are written to ``<output_file>/<stem of file_name>/`` and named
    ``<page number>-<xref>.<ext>``, where the page number is 1-based and
    ``ext`` / the raw bytes come from ``doc.extract_image``. Files left in
    that directory by a previous run are removed first.

    Extraction is best-effort: a page or image that fails is reported on
    stdout and skipped, and the remaining images are still processed.

    Args:
        file_name: Path of the PDF to read.
        output_file: Base output directory (despite the name, a directory,
            not a file); created if missing.

    Raises:
        Whatever ``fitz.open`` raises when the document cannot be opened.
    """
    doc = fitz.open(file_name)
    try:
        print("文件名:{}, 页数: {}, 对象: {}".format(
            file_name, len(doc), doc.xref_length() - 1))

        pic_path = Path(output_file) / Path(file_name).stem
        pic_path.mkdir(parents=True, exist_ok=True)
        # Pure-Python cleanup instead of shelling out to `rm`: no quoting or
        # injection problems with odd file names, and it works on any OS.
        _remove_stale_files(pic_path)

        for page_number, page in enumerate(doc, start=1):
            try:
                page_images = page.get_images()
            except Exception as err:
                print("page {}: could not list images: {}".format(
                    page_number, err))
                continue

            for image_info in page_images:
                # The first item of each entry is the image's xref number.
                xref = image_info[0]
                try:
                    img = doc.extract_image(xref)
                    image_path = pic_path / "{}-{}.{}".format(
                        page_number, xref, img["ext"])
                    image_path.write_bytes(img["image"])
                except Exception as err:
                    # Skip only this image; keep going with the others.
                    print("page {}: could not extract image {}: {}".format(
                        page_number, xref, err))
                    continue
                print(image_path)
    finally:
        # Release the document even if something above raised.
        doc.close()
if __name__ == '__main__':
    # Script entry point: extract the images of 3333.pdf into the "out"
    # directory. The pdfminer text-extraction variant below is disabled and
    # kept only for reference.
    # with open("out/3333.txt", 'w') as file:
    #     file.write(extract_text('3333.pdf'))
    mu_extract_images(file_name="3333.pdf", output_file="out")