#!/usr/local/bin/python3
# -*- encoding: utf-8 -*-
import requests
from lxml import etree
import os
from PIL import Image
import shutil
def get_doc_url(url):
    """Fetch an album page and return the links of the documents it lists.

    Args:
        url: Address of the album page to request.

    Returns:
        A list of ``href`` strings taken from the anchors found under
        ``div.list-bd`` / ``div.title.fl``, or ``None`` when the request
        fails, the server does not answer with HTTP 200, or the page
        cannot be parsed. Failures are reported on stdout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53"
    }
    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(url=url, headers=headers, timeout=30)
        if response.status_code != 200:
            print("【请求失败,请检查URL和网络环境!】")
            print(f"HTTP {response.status_code}")
            return None
        all_doc_page = etree.HTML(response.text)
        if all_doc_page is None:
            # The parser produced no tree, so there is nothing to query.
            return None
        return all_doc_page.xpath("//div[@class='list-bd']//div[@class='title fl']/a/@href")
    except (requests.RequestException, etree.LxmlError, ValueError) as e:
        # Network and parsing problems are reported, not raised, so the
        # caller can simply test the return value for truthiness.
        print("【请求失败,请检查URL和网络环境!】")
        print(e)
        return None
def jpg2pdf(doc_id):
    """Merge the downloaded page images of one document into a single PDF.

    Reads every ``output/<doc_id>/<n>.jpg`` file whose stem is a decimal
    number, orders the files by that number and writes them to
    ``output/<doc_id>/<doc_id>.pdf``. Once the PDF exists, the page images
    that went into it are deleted.

    Args:
        doc_id: Name of the document folder under ``output``; also used as
            the stem of the generated PDF file.

    Returns:
        None. Success or failure is reported on stdout.
    """
    doc_dir = os.path.join("output", doc_id)
    pdf_path = os.path.join(doc_dir, f"{doc_id}.pdf")

    # Collect pages by their numeric file name instead of assuming the
    # folder holds exactly 0.jpg..N-1.jpg: a PDF left over from an earlier
    # run, or numbering that does not start at 0, would break that guess.
    pages = []
    if os.path.isdir(doc_dir):
        for entry in os.listdir(doc_dir):
            stem, ext = os.path.splitext(entry)
            if ext.lower() == ".jpg" and stem.isdecimal():
                pages.append((int(stem), os.path.join(doc_dir, entry)))
    pages.sort()

    if not pages:
        # Nothing to convert; avoid indexing into an empty image list.
        print(f"{doc_id} 【转换失败,请检查!】")
        return

    image_list = []
    for _, image_path in pages:
        # convert() returns an independent in-memory image, so the file
        # handle can be closed right away (required before deleting the
        # file on some platforms).
        with Image.open(image_path) as img:
            image_list.append(img.convert("RGB"))

    image_list[0].save(pdf_path, "PDF", resolution=100.0, save_all=True, append_images=image_list[1:])

    if os.path.exists(pdf_path):
        # Conversion succeeded: remove the source images.
        print(f"{doc_id} 【转换为pdf成功!】")
        # os.remove works on every platform, unlike the shell `del` command.
        for _, image_path in pages:
            os.remove(image_path)
        print(f"{doc_id} 文件夹图片已删除")
    else:
        # Conversion failed.
        print(f"{doc_id} 【转换失败,请检查!】")
def get_jpg(doc_url):
    """Download the page images of each document and convert them to PDF.

    For every link the document page is fetched, the preview image URLs
    are read from it, all pages are downloaded to
    ``output/<doc_id>/<page>.jpg`` and ``jpg2pdf`` is called for that
    document. A document whose page or images cannot be fetched is
    reported on stdout and skipped.

    Args:
        doc_url: Sequence of document links (``href`` values) relative to
            ``https://www.51jiaoxi.com/``.

    Returns:
        None.
    """
    from urllib.parse import urljoin

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53"
    }
    total_docs = len(doc_url)
    # enumerate() gives the progress counter directly; list.index() would be
    # O(n) per document and wrong when the same link appears twice.
    for position, href in enumerate(doc_url, start=1):
        url = urljoin("https://www.51jiaoxi.com/", href)
        # NOTE(review): assumes links look like ".../<name>-<id>.html" — confirm.
        doc_id = url.split("-")[1].split(".")[0]
        try:
            response = requests.get(url=url, headers=headers, timeout=30)
            response.raise_for_status()
            html = etree.HTML(response.text)
            preview_urls = html.xpath("//div[@class='img-box']/img/@src") if html is not None else []
            if not preview_urls:
                print(f"{doc_id} 【未找到预览图片,已跳过!】")
                continue

            first_src = preview_urls[0]
            parts = first_src.split("/")
            jpg_url_prefix = f"https://{parts[2]}/{parts[3]}/{parts[4]}/{parts[5]}"
            # The number of the first preview image is the first page number.
            start_page_num = int(first_src.split(".")[2].split("/")[-1])
            # The "remaining previews" counter is read as 0 when it is absent.
            remaining = html.xpath("//div[@class='remain-previews-inner']/span/span/text()")
            hidden_page_num = int(remaining[0]) if remaining else 0
            all_page_num = len(preview_urls) + hidden_page_num

            doc_dir = os.path.join("output", doc_id)
            os.makedirs(doc_dir, exist_ok=True)
            print("\n正在下载试卷: {}...".format(doc_id))
            # Count pages from the first page number so that no page is
            # dropped when numbering does not start at 0.
            for page in range(start_page_num, start_page_num + all_page_num):
                page_url = f"{jpg_url_prefix}/0/{page}.jpg?x-oss-process=image/crop,h_1044,g_center/format,webp"
                page_response = requests.get(url=page_url, headers=headers, timeout=30)
                # Never store an HTTP error body as if it were an image.
                page_response.raise_for_status()
                with open(os.path.join(doc_dir, f"{page}.jpg"), "wb") as f:
                    f.write(page_response.content)
        except requests.RequestException as e:
            print("【请求失败,请检查URL和网络环境!】")
            print(e)
            continue
        print("下载完成!")
        print("正在转换为pdf...")
        jpg2pdf(doc_id)
        print(f"已完成: 【{position}/{total_docs}】")
    print("\n全部下载完毕!\n")
def main():
    """Ask for an album link, then download and convert all its documents.

    Keeps prompting until the entered text contains ``"album"``. The link
    is passed to ``get_doc_url`` and, if any document links come back, they
    are handed to ``get_jpg``; otherwise an error message is printed.
    """
    # Strip surrounding whitespace so a pasted link with a stray space or
    # newline is not sent to the server verbatim.
    url = input("\n请输入成套试卷链接: ").strip()
    while "album" not in url:
        print("【不是成套试卷链接,请重新输入!】")
        url = input("\n请输入成套试卷链接: ").strip()
    doc_url = get_doc_url(url)
    if doc_url:
        get_jpg(doc_url)
    else:
        print("无法获取试卷链接,请检查输入的链接是否正确。")
# Run the interactive downloader only when executed as a script, not on import.
if __name__ == "__main__":
    main()