# 从不同书站采集书籍 (collect book metadata from multiple book sites)
import json
import logging
import re
import time
from flask import Flask, jsonify
import requests
from lxml import etree
from redis_ip import redis_ip  # project-local rotating proxy pool backed by Redis
import pymongo
app = Flask(__name__)
# Browser-like User-Agent so the target sites serve normal pages.
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Connect to the MongoDB database (acts as a cache of previously scraped books)
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["book_database"]
collection = db["bookapi"]
def requests_page(url):
    """Fetch *url* through a rotating proxy and return the page text.

    Args:
        url: absolute URL to request.

    Returns:
        str: the decoded page body on HTTP 200, or one of the Chinese
            sentinel strings ('触发反爬', '没有资源',
            '源网址信号不稳定,请稍后再试') for 403/404/500.
        None: on any other status code or on a network error.
    """
    try:
        logging.info(f"正在请求页面: {url}")
        res = requests.get(url=url, headers=headers, proxies=redis_ip.ip(), timeout=10)
        res.encoding = 'utf-8'
        logging.info(f"成功获取页面内容,状态码: {res.status_code}")
        # Single elif chain: the original used independent `if`s where the
        # final `else` only bound to the 500 test and worked by accident.
        if res.status_code == 200:
            return res.text
        elif res.status_code == 403:
            return '触发反爬'
        elif res.status_code == 404:
            return '没有资源'
        elif res.status_code == 500:
            return '源网址信号不稳定,请稍后再试'
        else:
            return None
    except requests.exceptions.RequestException as e:
        logging.error(f"请求失败:{e}")
        return None
def is_chinese(text):
    """Classify *text* to route the lookup to Chinese vs. English sources.

    Returns:
        True  -- text contains a CJK character, or consists of digits only;
        False -- text contains English letters (with or without digits and
                 symbols) but no CJK characters;
        "参数错误" -- anything else (empty string, symbols only, or digits
                 mixed with symbols).
    """
    has_chinese = has_number = has_english = has_symbol = False
    for ch in text:
        if '\u4e00' <= ch <= '\u9fff':
            has_chinese = True
        elif ch.isdigit():
            has_number = True
        elif 'a' <= ch <= 'z' or 'A' <= ch <= 'Z':
            has_english = True
        else:
            has_symbol = True
    if has_chinese or (has_number and not has_english and not has_symbol):
        return True
    # The original `has_english or (has_english and has_number) or
    # (has_english and has_number and has_symbol)` reduces to `has_english`.
    if has_english:
        return False
    return "参数错误"
def get_isbnsearch_details(isbn, origin_url):
    """Scrape book details for *isbn* from an isbnsearch.org detail page.

    Args:
        isbn: ISBN being looked up (echoed into the result dict).
        origin_url: full isbnsearch.org URL to fetch.

    Returns:
        dict of book fields, or None when the page cannot be fetched or parsed.
    """
    res = requests_page(origin_url)
    # requests_page signals failure with None or one of these sentinel strings.
    if res is None or res in ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试'):
        logging.error(f"无法获取 ISBN 为 {isbn} 的书籍信息页面内容(isbnsearch)。")
        return None
    try:
        logging.info(f"正在解析 isbnsearch 网站 ISBN 为 {isbn} 的页面内容")
        tree = etree.HTML(res)
        title = ''.join(tree.xpath('//div[@class="bookinfo"]//h1//text()'))
        img_urls = ''.join(tree.xpath('//div[@class="image"]//img//@src'))
        # Field values are the last text node of fixed-position <p> elements
        # on isbnsearch detail pages — positional, so brittle to site changes.
        author = tree.xpath('//div[@class="bookinfo"]//p[3]//text()')[-1]
        binding = tree.xpath('//div[@class="bookinfo"]//p[4]//text()')[-1]
        publisher = tree.xpath('//div[@class="bookinfo"]//p[5]//text()')[-1]
        published_date = tree.xpath('//div[@class="bookinfo"]//p[6]//text()')[-1]
        return {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),  # author with surrounding whitespace removed
            'publisher': publisher,
            'published_date': published_date,
            'origin': 'isbnsearch',
        }
    except Exception as e:
        logging.error(f"处理 ISBN 为 {isbn} 的页面时出现错误(isbnsearch):{e}")
        time.sleep(10)  # back off before the caller falls through to the next source
        return None
def get_nicebooks_details(isbn, origin_url):
    """Scrape book details for *isbn* from a nicebooks.com search result page.

    Args:
        isbn: ISBN being looked up (echoed into the result dict).
        origin_url: full nicebooks.com search URL to fetch.

    Returns:
        dict of book fields, or None when the page cannot be fetched, is the
        empty-search page, or cannot be parsed.
    """
    res = requests_page(origin_url)
    # The generic search <title> means "no result found" for this ISBN.
    if (res is None
            or '<title>Search books by ISBN · NiceBooks</title>' in res
            or res in ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试')):
        logging.error(f"无法获取 ISBN 为 {isbn} 的书籍信息页面内容(nicebooks)。")
        return None
    try:
        # Original message misspelled the site as "nickbooks".
        logging.info(f"正在解析 nicebooks 网站 ISBN 为 {isbn} 的页面内容")
        tree = etree.HTML(res)
        title = ''.join(tree.xpath('//a[@class="title"]//text()'))
        img_urls = ''.join(tree.xpath('//div[@class="small-6 medium-2 columns"]//a//@src'))
        # Strip layout noise and the leading "by" from the author line.
        author = ''.join(
            tree.xpath('//div[@class="medium-10 small-margin-top columns"]//div[2]//text()')
        ).replace('\n', '').replace(' ', '').replace('by', '').strip()
        publisher = ''.join(tree.xpath('//div[@class="medium-10 small-margin-top columns"]//div[4]//text()'))
        # div[5] holds both the publication date (first node) and binding (second).
        published_date = tree.xpath('//div[@class="medium-10 small-margin-top columns"]//div[5]//text()')[0]
        binding = tree.xpath('//div[@class="medium-10 small-margin-top columns"]//div[5]//text()')[1]
        return {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'published_date': published_date,
            'origin': 'nicebooks',
        }
    except Exception as e:
        logging.error(f"处理 ISBN 为 {isbn} 的页面时出现错误(nicebooks):{e}")
        time.sleep(10)  # back off before the caller falls through to the next source
        return None
def get_bookuu_details(isbn, origin_url):
    """Fetch book details for *isbn* from the bookuu.com search endpoint.

    Args:
        isbn: ISBN being looked up (echoed into the result dict).
        origin_url: full bookuu.com search URL to fetch.

    Returns:
        dict of book fields, or None on any fetch/parse failure.
    """
    res = requests_page(origin_url)
    if (res is None
            or '<title>图书批发一站式图书批发平台 - 馆配图书平台 - 博库网批发平台</title>' in res
            or res in ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试')):
        logging.error(f"无法获取 ISBN 为 {isbn} 的书籍信息页面内容(bookuu)。")
        return None
    try:
        logging.info(f"正在解析 bookuu 网站 ISBN 为 {isbn} 的页面内容")
        # The fields below are read by string key, so the endpoint is expected
        # to return JSON — parse it explicitly. NOTE(review): the original
        # subscripted the raw response text (always a TypeError) and then
        # returned the raw text from the except path, leaking a non-dict to
        # the caller; verify the actual response schema against the site.
        data = json.loads(res)
        title = data['title']
        img_urls = data['img_urls']
        author = data['author']
        publisher = data['publisher']
        published_date = data['published_date']
        binding = data['binding']
        return {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'published_date': published_date,
            'origin': '博库网',
        }
    except Exception as e:
        logging.error(f"处理 ISBN 为 {isbn} 的页面时出现错误(bookuu):{e}")
        time.sleep(10)  # back off before the caller falls through to the next source
        # Must be None (not `res`): callers treat any non-None value as a book dict.
        return None
def get_kongfz_details(isbn, origin_url):
    """Fetch book details for *isbn* from the kongfz.com search JSON API.

    Args:
        isbn: ISBN being looked up (echoed into the result dict).
        origin_url: full kongfz.com search API URL to fetch.

    Returns:
        dict of book fields, or None when the request fails, the search has
        no hits ('"totalFound":0'), or the JSON cannot be parsed.
    """
    res = requests_page(origin_url)
    if (res is None
            or res in ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试')
            or '"totalFound":0' in res):
        logging.error(f"无法获取 ISBN 为 {isbn} 的书籍信息页面内容(kongfz)。")
        return None
    try:
        logging.info(f"正在解析 kongfz 网站 ISBN 为 {isbn} 的页面内容")
        payload = json.loads(res)
        # Take the first search hit for this ISBN.
        item = payload['data']['itemResponse']['list'][0]
        return {
            'title': item['title'],
            'origin_url': origin_url,
            'img_urls': item['imgUrl'],
            'isbn': isbn,
            'author': item['author'],
            # Binding is the value of the last 'binding' template record.
            'binding': item['tplRecords']['binding'][-1]['value'],
            'formatted_author': item['author'].strip(),
            'publisher': item['press'],
            'published_date': item['pubDateText'],
            'origin': '孔夫子旧书网',
        }
    except Exception as e:
        logging.error(f"处理 ISBN 为 {isbn} 的页面时出现错误(kongfz):{e}")
        time.sleep(10)  # back off before the caller falls through to the next source
        return None
def get_douban_details(isbn, origin_url):
    """Scrape book details for *isbn* from douban.com (search page + detail page).

    Makes two requests: the search page (to find the subject URL embedded in
    its inline JSON) and then the book detail page itself.

    Args:
        isbn: ISBN being looked up (echoed into the result dict).
        origin_url: full douban.com search URL to fetch.

    Returns:
        dict of book fields, or None on any fetch/parse failure.
    """
    res = requests_page(origin_url)
    # The search-page <title> fragment indicates an empty result list.
    if (res is None
            or res in ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试')
            or '读书 - 豆瓣搜索</title>' in res):
        logging.error(f"无法获取 ISBN 为 {isbn} 的书籍信息页面内容(douban)。")
        return None
    try:
        logging.info(f"正在解析 douban 网站 ISBN 为 {isbn} 的页面内容")
        # Detail-page URL is embedded in the search page's inline JSON.
        second_url = ''.join(re.findall('"url": "(.*?)"}], "report"', res))
        if not second_url:
            logging.error(f"未能从搜索页提取 ISBN 为 {isbn} 的详情页链接(douban)。")
            return None
        second_html = requests_page(second_url)
        # Original passed None/sentinel strings straight into etree.HTML,
        # relying on the broad except below — fail explicitly instead.
        if second_html is None or second_html in ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试'):
            logging.error(f"无法获取 ISBN 为 {isbn} 的详情页内容(douban)。")
            return None
        second_tree = etree.HTML(second_html)
        # Flatten the #info block to one space-normalized string, then pull
        # each labelled field out with regexes keyed on the Chinese labels.
        all_msg = ''.join(second_tree.xpath('//div[@id="info"]//text()')).replace('\n', '').strip()
        all_msg = re.sub(r'\s+', ' ', all_msg)
        title = second_tree.xpath('//div[@id="wrapper"]//h1//text()')[1]
        img_urls = ''.join(second_tree.xpath(f'//img[@alt="{title}"]//@src'))
        author = ''.join(re.findall('作者: (.*?) ', all_msg))
        publisher = ''.join(re.findall('出版社: (.*?) ', all_msg))
        fanyi = ''.join(re.findall('译者: (.*?) ', all_msg))           # translator
        fubiaoti = ''.join(re.findall('副标题: (.*?) ', all_msg))      # subtitle
        yuanzuoming = ''.join(re.findall('原作名: (.*?) ', all_msg))   # original title
        published_date = ''.join(re.findall('出版年: (.*?) ', all_msg))
        yeshu = ''.join(re.findall('页数: (.*?) ', all_msg))           # page count
        price = ''.join(re.findall('定价: (.*?) ', all_msg))
        binding = ''.join(re.findall('装帧: (.*?) ', all_msg))
        return {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'fanyi': fanyi,
            'fubiaoti': fubiaoti,
            'yuanzuoming': yuanzuoming,
            'yeshu': yeshu,
            'price': price,
            'published_date': published_date,
            'origin': '豆瓣网',
        }
    except Exception as e:
        logging.error(f"处理 ISBN 为 {isbn} 的页面时出现错误(douban):{e}")
        time.sleep(10)  # back off before the caller falls through to the next source
        return None
@app.route('/bookapi/<isbn>')
def get_book_info(isbn):
    """API endpoint: return book metadata for *isbn* as JSON.

    Checks the MongoDB cache first; on a miss, tries the scraping sources in
    an order chosen by the script of the query (Chinese/digits-only queries
    start with Chinese sources), stores the first hit back into MongoDB, and
    returns a whitelisted projection of the fields.

    Returns:
        Flask JSON response with the book fields, or ({"error": ...}, 404)
        when no source yields a result.
    """
    # Whitelist of keys serialized to the client; missing keys default to ''.
    # ("product_descritpion" is the key actually stored — kept as-is.)
    return_keys = [
        "title", "origin_url", "product_descritpion", "author", "publisher", "published_date", "img_urls",
        "isbn", "binding", "format", "isItASet", "paper", "classification", "type", "content_description",
        "author_introduction", "catalogue", "preface", "online_trial_reading", "media_comments", "book_url",
        "time", "seriestitle", "isbn10", "price", "genus", "levelNum", "heatNum", "edition", "yinci",
        "language", "keyword", "fanyi",
        "fubiaoti", "yuanzuoming", "yeshu", "origin"
    ]
    # 1) Cache lookup.
    book_data = None
    try:
        book_data = collection.find_one({"isbn": isbn})
    except Exception as db_error:
        logging.error(f"数据库查询出现错误:{db_error}")
    if book_data:
        logging.info(f"从数据库中获取 ISBN 为 {isbn} 的书籍信息")
        return jsonify({key: book_data.get(key, '') for key in return_keys})
    # 2) Cache miss: scrape. Map each source URL to its scraper so the loop
    # below replaces the original five copy-pasted if-branches.
    bookuu_url = f'https://pifa.bookuu.com/search?keyword={isbn}&flag=1'
    douban_url = f'https://search.douban.com/book/subject_search?search_text={isbn}&cat=1001'
    kongfz_url = f'https://search.kongfz.com/pc-gw/search-web/client/pc/product/keyword/list?dataType=0&keyword={isbn}&page=1&userArea=1001000000'
    isbnsearch_url = f'https://isbnsearch.org/isbn/{isbn}'
    nicebooks_url = f'https://us.nicebooks.com/search/isbn?isbn={isbn}'
    handlers = {
        bookuu_url: get_bookuu_details,
        douban_url: get_douban_details,
        kongfz_url: get_kongfz_details,
        isbnsearch_url: get_isbnsearch_details,
        nicebooks_url: get_nicebooks_details,
    }
    if is_chinese(str(isbn)):
        print(f'纯数字和有汉字的参数,使用以博库网为首的中文源')
        origin_url_list = [bookuu_url, douban_url, kongfz_url, isbnsearch_url, nicebooks_url]
    else:
        print(f'英文参数,使用以 nicebooks 为首的英文源')
        origin_url_list = [nicebooks_url, isbnsearch_url, bookuu_url, douban_url, kongfz_url]
    book_details = None
    for origin_url in origin_url_list:
        try:
            book_details = handlers[origin_url](isbn, origin_url)
            if book_details is not None:
                book_details['origin_url'] = origin_url
                break
        except Exception as func_error:
            logging.error(f"处理 URL {origin_url} 时出现错误:{func_error}")
    if book_details is None:
        return jsonify({"error": "无法找到书籍信息。"}), 404
    print(f'book_details:{book_details}')
    # 3) Persist the scraped record. insert_one mutates book_details by adding
    # an _id, but only whitelisted keys are serialized, so jsonify stays safe.
    try:
        collection.insert_one(book_details)
        logging.info(f"将 ISBN 为 {isbn} 的书籍信息存储到数据库")
    except Exception as insert_error:
        logging.error(f"数据库插入出现错误:{insert_error}")
    return jsonify({key: book_details.get(key, '') for key in return_keys})
if __name__ == '__main__':
    # Flask development server; front with a real WSGI server in production.
    app.run()