爬取博客文章
1、去除置顶的旧文章内容:
import requests
from lxml import etree

# Address of the blog index page to scrape (blank placeholder; fill in before running).
url = ""

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() prevents an HTTP error page from being parsed as the blog.
r = requests.get(url, timeout=10)
r.raise_for_status()

selector = etree.HTML(r.text)
# Select every element carrying role="article" (presumably one per post entry).
divs = selector.xpath('//*[@role="article"]')

# Collected "date title link" strings for every post that is not pinned.
articles = []
for d in divs:
    # normalize-space() trims the title and collapses inner runs of whitespace.
    title = d.xpath('normalize-space(./*[@class="postTitle"]/a/span)')
    if '置顶' in title:
        # The title carries the pinned marker, so this entry is skipped.
        continue

    day_links = d.xpath('./*[@class="dayTitle"]/a')
    hrefs = d.xpath('./*[@class="postTitle"]/a/@href')
    if not day_links or not hrefs:
        # The entry lacks a date link or a post link; skip it instead of
        # crashing with IndexError on the [0] lookup.
        continue

    # .text is None when the link has no direct text, so fall back to ''.
    day_text = (day_links[0].text or '').strip()
    article = day_text + ' ' + title.strip() + ' ' + hrefs[0]
    print(article)
    articles.append(article)
输出:

2、若要提取置顶文章的标题:
# HTML fragment to parse (empty placeholder in this snippet).
html_str = ''''''
html = etree.HTML(html_str)

# Pinned marker: text of the inner <span> nested inside the title <span>.
marker_nodes = html.xpath('//div[@class="postTitle"]/a/span/span/text()')
text_0 = marker_nodes[0]

# Title: the text nodes that follow the inner <span> at the same level
# (following-sibling:: selects every sibling after the current node).
trailing_nodes = html.xpath('//div[@class="postTitle"]/a/span/span/following-sibling::text()')
text = trailing_nodes[0].strip()

# Emit the marker immediately followed by the cleaned-up title.
print("".join([text_0, text]))
输出:
