# Crawl public-resource trading notices and save the data to CSV.
# Source article: xingmeng1 / 2023-07-30

import json
import os.path
import time

from jsonpath import *
# import jsonpath as jsonpath
import pandas as pd
import requests



# url = "http://www.whggzy.com/front/search/category"




def get_resp(url, name, i, timeout=30):
    """POST a paginated category query and return the decoded JSON response.

    Args:
        url: Search endpoint (e.g. http://www.whggzy.com/front/search/category).
        name: Category code string (e.g. "GovernmentProcurement").
        i: 1-based page number to request.
        timeout: Seconds to wait for connect/read before raising
            requests.exceptions.Timeout. Without this the call could hang
            forever on a stalled connection.

    Returns:
        The parsed JSON body as a Python dict.
    """
    headers = {
        "Referer": "http://www.whggzy.com/PoliciesAndRegulations/index.html?utm=sites_group_front.26a79a93.0.0.715108e02e0e11ee837be5c5ca3fd993",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept": "*/*",
        "Content-Type": "application/json",
        "X-Requested-With": "XMLHttpRequest"
    }
    data = {
        "utm": "sites_group_front.26a79a93.0.0.715108e02e0e11ee837be5c5ca3fd993",
        "categoryCode": f"{name}",
        "pageSize": 15,
        # The server expects the page number as a string in the JSON body.
        "pageNo": f"{i}"
    }
    # json=data serializes the payload as a JSON request body.
    resp = requests.post(url, headers=headers, json=data, timeout=timeout)
    return resp.json()

def save_json(content):
    """Dump `content` to wh_data.json (overwriting any previous snapshot).

    Uses ensure_ascii=False so non-ASCII text (the scraped data is Chinese)
    is written as readable UTF-8 instead of \\uXXXX escapes — the file is
    already opened with encoding="utf-8", so escaping was pure loss.
    """
    data = json.dumps(content, ensure_ascii=False)
    with open("wh_data.json", 'w', encoding="utf-8") as w:
        w.write(data)

def _first_match(doc, expr):
    """Return the first jsonpath match for `expr` in `doc`, or None if absent."""
    matches = jsonpath(doc, expr)
    return matches[0] if matches else None


def get_data(data_list, csv_path, i):
    """Extract one CSV row per hit and append the rows to `csv_path`.

    Args:
        data_list: List of result documents (the `hits.hits` array).
        csv_path: Destination CSV file path.
        i: Page number, used only for the progress message.
    """
    base_url = 'http://www.whggzy.com/'
    pathName = ''
    for data in data_list:
        # Each jsonpath expression is now evaluated once per field (the
        # original evaluated every expression twice).
        pathName = _first_match(data, '$..pathName')
        title = _first_match(data, '$..title')

        publishDate = _first_match(data, '$..publishDate')
        # publishDate is epoch milliseconds; guard against a missing value,
        # which previously crashed with `None / 1000`.
        if publishDate is not None:
            date = time.strftime('%Y-%m-%d', time.localtime(publishDate / 1000))
        else:
            date = None

        attachmentUrl = _first_match(data, '$..attachmentUrl')
        rel_url = _first_match(data, '$..url')
        url = base_url + rel_url if rel_url is not None else None

        save_csv([pathName, title, date, attachmentUrl, url], csv_path)

    print(f'政策法规-->>{pathName}-->> 第{i}页下爬取完毕 !!!')



def judge_csv_file():
    """Ensure the output CSV exists next to this script and return its path.

    On first run the file is created with a UTF-8 header row; on later runs
    the existing file is left untouched so rows keep accumulating.
    """
    # Directory containing this script file.
    script_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
    csv_path = os.path.join(script_dir, 'wh_data.csv')
    print(csv_path)
    if not os.path.exists(csv_path):
        header_row = ",".join(['项目', '标题', '日期', '附件网址', '内容地址']) + '\n'
        with open(csv_path, 'w', encoding="utf-8") as out:
            out.write(header_row)
    return csv_path


def save_csv(data_list, csv_path):
    """Append `data_list` as a single row to `csv_path`.

    mode='a' appends; index=False and header=False suppress pandas'
    row index and column header so only the raw values are written.
    """
    row_frame = pd.DataFrame([data_list])
    row_frame.to_csv(csv_path, mode='a', index=False, header=False, encoding='utf-8')

def run(url):
    """Crawl every category, page by page, until an empty page is returned.

    Each page's raw JSON is snapshotted to wh_data.json and its hits are
    appended to the CSV created by judge_csv_file().
    """
    csv_path = judge_csv_file()
    categories = ["GovernmentProcurement", "BidAndEngineerConstruction", "LandAndMineralRightsTransaction",
                  "TransactionOfPropertyRights", "TransactionOfPublicResources"]

    for category in categories:
        page = 1
        while True:
            payload = get_resp(url, category, page)
            save_json(payload)
            hits = payload['hits']['hits']
            # An empty hits array means we ran past the last page.
            if not hits:
                break
            get_data(hits, csv_path, page)
            page += 1

if __name__ == '__main__':
    # Entry point: crawl the category search endpoint.
    target_url = "http://www.whggzy.com/front/search/category"
    run(target_url)