爬取博客文章

jeyeshield / 2023-07-18 / 原文

1、爬取文章列表,并过滤掉置顶的旧文章:

import requests
from lxml import etree

url = ""  # address of the page to fetch and scrape (fill in before running)

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops us from parsing an HTTP error page as if it
# were the post list.
r = requests.get(url, timeout=10)
r.raise_for_status()
selector = etree.HTML(r.text)
divs = selector.xpath('//*[@role="article"]')

# Each entry is "<day> <title> <href>" for one post whose title does not
# contain the pinned marker.
articles = []
for d in divs:
    # normalize-space() yields a string ('' when the node is missing), so the
    # pinned check is safe even on malformed entries.
    title = d.xpath('normalize-space(./*[@class="postTitle"]/a/span)')
    if '置顶' in title:
        continue  # pinned post: skip before touching its date/link at all

    day_links = d.xpath('./*[@class="dayTitle"]/a')
    hrefs = d.xpath('./*[@class="postTitle"]/a/@href')
    if not day_links or not hrefs:
        # Entry without a date link or a post link: skip it instead of
        # aborting the whole run with an IndexError.
        continue

    day = day_links[0]
    href = hrefs[0]
    # day.text is None when the <a> has no leading text node.
    article = (day.text or '').strip() + ' ' + title.strip() + ' ' + href
    print(article)
    articles.append(article)

输出:

 

2、若要获取置顶文章的标题(包含“置顶”标记):

html_str = ''''''  # HTML snippet to parse (paste the markup here)

html = etree.HTML(html_str)

# Pinned marker: text of the <span> nested inside the title's outer <span>.
result = html.xpath('//div[@class="postTitle"]/a/span/span/text()')
text_0 = result[0]

# Title: following-sibling::text() selects every text node that comes after
# the inner <span> at the same level; only the first one is used.
result = html.xpath('//div[@class="postTitle"]/a/span/span/following-sibling::text()')
first_sibling_text = result[0]
text = first_sibling_text.strip()

# Emit marker and title back to back, with no separator.
print(f'{text_0}{text}')

输出: