import time
import requests
from lxml import etree
from 常用功能.mongodb链接 import connect_mongodb
# MongoDB connection settings for the spider database.
# NOTE(review): credentials and host are hard-coded in source — consider
# moving them to environment variables or a config file.
user_value = 'bk_spider'
pwd_value = 'ke@spider01'
mongo_id_value = '119.45.40.170'
port_value = 27017
db_value = 'spider'
coll_value = "channel_directional_crawl"
# connect_mongodb presumably returns a (database, collection) pair —
# verify against 常用功能.mongodb链接. Index 0 is used as the db handle,
# index 1 as the collection handle used for dedup checks below.
db_coll = connect_mongodb(user_value, pwd_value, mongo_id_value, port_value, db_value, coll_value)
db = db_coll[0]
coll = db_coll[1]
def get_gaozhong_url(num_list: int):
    """Build the list of high-school ("gaozhong") listing-page URLs.

    Args:
        num_list: Exclusive upper bound of the page number; pages
            1 .. num_list - 1 are generated (preserves the original
            ``range(1, num_list)`` behavior).

    Returns:
        list[str]: One listing-page URL per page number.
    """
    # Comprehension instead of a manual append loop (ruff PERF401).
    return [
        f'https://www.hxx.net/school/gaozhong/list_{num}.html'
        for num in range(1, num_list)
    ]
def get_chuzhong_url(num_list: int):
    """Build the list of middle-school ("chuzhong") listing-page URLs.

    Args:
        num_list: Exclusive upper bound of the page number; pages
            1 .. num_list - 1 are generated (preserves the original
            ``range(1, num_list)`` behavior).

    Returns:
        list[str]: One listing-page URL per page number.
    """
    # Comprehension instead of a manual append loop (ruff PERF401).
    return [
        f'https://www.hxx.net/school/chuzhong/list_{num}.html'
        for num in range(1, num_list)
    ]
def get_detail(url):
    """Scrape one school listing page and store each school's detail record.

    Fetches the listing page at *url*, follows every school's detail link,
    extracts attributes (nature, level, form, founding date, address, rating
    scores, introduction) and builds a document for the ``coll`` MongoDB
    collection, deduplicating by school name.  The actual ``insert_one`` is
    commented out in SOURCE (dry-run mode) and is kept commented here.

    Args:
        url: Listing-page URL, e.g. from get_gaozhong_url / get_chuzhong_url.
    """
    import re  # stdlib; hoisted here once — the original re-imported it per school

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
    }
    session = requests.Session()
    html = session.get(url=url, headers=headers).content.decode()
    tree = etree.HTML(html)
    second_url_list = tree.xpath("//div[@class='zhongxueSchList']//li//dt/a/@href")
    # NOTE(review): assumes this list is index-aligned with second_url_list —
    # both come from the same <a> nodes, so lengths should match; verify.
    scool_name_list = tree.xpath("//div[@class='zhongxueSchList']//li//dt/a/text()")
    # Compile the score pattern once, outside the per-school loop.
    score_pattern = re.compile(
        r'环境(\d+\.\d+)分\s*师资(\d+\.\d+)分\s*服务(\d+\.\d+)分\s*效果(\d+\.\d+)分')
    # enumerate() instead of list.index(): index() rescans the list on every
    # iteration and returns the wrong position if a URL appears twice.
    for num, second_url in enumerate(second_url_list):
        print(f'第{num}个')
        scool_name = scool_name_list[num]
        second_html = session.get(url=second_url, headers=headers).content.decode()
        second_tree = etree.HTML(second_html)
        # School nature (院校性质)
        yuanxiaoxingzhi = ''.join(
            second_tree.xpath("//dd/span[contains(text(), '院校性质')]/text()")
        ).replace('院校性质:', '')
        # Schooling level (办学层次)
        banxuecengci = ''.join(
            second_tree.xpath("//dd/span[contains(text(), '办学层次')]/i/text()"))
        # School form (学校形式)
        xuexiaoxingshi = ''.join(
            second_tree.xpath("//dd/span[contains(text(), '学校形式')]/text()")
        ).replace('学校形式:', '')
        # Founding date (建校时间)
        jianxiaoshijian = ''.join(
            second_tree.xpath("//div[@class='dl']/dd[2]/span[2]/i/text()"))
        # School address (学校地址)
        xuexiaodizhi = ''.join(
            second_tree.xpath("//div[@class='dl']/dd[4]/span/text()")
        ).replace('学校地址:', '')
        # Rating block.  The original did xpath(...)[0] unconditionally, which
        # raises IndexError on pages without a class="pf" div and made the
        # following "is not None" check dead code — skip such pages instead.
        pf_divs = second_tree.xpath('//div[@class="pf"]')
        if not pf_divs:
            continue
        text = pf_divs[0].xpath('string()').strip()
        match = score_pattern.search(text)
        if not match:
            continue
        # Popularity (热度)
        redu = second_tree.xpath('//dt/span/text()')[0]
        # Environment / faculty / service / effect scores.  match.groups()
        # replaces the original per-group ''.join() calls, which were no-ops
        # on an already-joined string.
        huanjingpingfen, shizipingfen, fuwupingfen, xiaoguopingfen = match.groups()
        # School introduction (学校简介), with the "expand" glyph and
        # full-width spaces stripped out.
        jianjie = ''.join(second_tree.xpath(
            '//div[@class="schoo_introduction nomargintop"]/div[@class="content"]//text()'
        )).replace('\ue614点击展开', '').replace('\u3000', '')
        dict1 = {
            'task_name': '好学校',
            'domain': 'https://www.hxx.net/',
            'url': 'https://www.hxx.net/',
            'ts': int(time.time()),
            'tag': '',
            'channal': '好学校',
            'quality': '非官方',
            # Updated per crawl batch.
            'batch': 'batch 20240711_batch_1',
            # cos_url layout:
            #   https://data-crawler-.../{file type}/{channel}/{uploaded file name}
            # by file type:
            #   a. web pages: html/{site name}/{page title}.html
            #   b. videos:    videos/{channel, e.g. douyin}/{title}.mp4
            #   c. images:    IMG/{channel}/{title}.jpg
            #   d. PDF:       PDF/{channel}/{title}.pdf
            #   e. excel:     excel/{channel}/{title}.xlsx
            #   f. json:      json/{channel}/{title}.json
            'cos_url': 'https://data-crawler-1325559378.cos.ap-beijing.myqcloud.com/json/240711/全国各省-城市-中学.json',
            'meta': {'一级分类': '教育配套',
                     '二级分类': '学校基础信息',
                     '三级分类': '全国',
                     '性质': '非官方',
                     '信息来源': '好学校',
                     '说明': '全国各省-城市-中学',
                     '链接': 'https://www.hxx.net/'
                     },
            'content': {
                '学校名称': scool_name,
                '详情页链接': second_url,
                '院校性质': yuanxiaoxingzhi,
                '办学层次': banxuecengci,
                '学校形式': xuexiaoxingshi,
                '建校时间': jianxiaoshijian,
                '学校地址': xuexiaodizhi,
                '热度': redu,
                '环境评分': huanjingpingfen,
                '师资评分': shizipingfen,
                '服务评分': fuwupingfen,
                '效果评分': xiaoguopingfen,
                'ocr_content': f'学校名称:{scool_name},热度:{redu},环境评分:{huanjingpingfen},师资评分:{shizipingfen}服务评分:{fuwupingfen},效果评分:{xiaoguopingfen}' + jianjie,
            }
        }
        # Deduplicate by school name before (would-be) insertion.
        print({"content['学校名称']": scool_name})
        count = coll.count_documents({"content.学校名称": scool_name})
        print(count)
        if count != 0:
            print(f"{scool_name}已存在")
            print('\n')
        else:
            # coll.insert_one(dict1)
            print(f'{scool_name}插入完成')
            print('\n')
if __name__ == '__main__':
    # Listing pages for high schools (1..1102) and middle schools (1..188).
    high_school_pages = get_gaozhong_url(1103)
    middle_school_pages = get_chuzhong_url(189)
    print(f'gaozhong_url_list:{high_school_pages}')
    for page_url in high_school_pages:
        print(f'gaozhong_url:{len(page_url),page_url}')
        get_detail(page_url)
    # Middle-school crawl is currently disabled:
    # print(f'chuzhong_url_list:{middle_school_pages}')
    # for chuzhong_url in middle_school_pages:
    #     get_detail(chuzhong_url)