[今日头条] 标题采集优化
This commit is contained in:
parent
5b3bf034f8
commit
592e8b6cfb
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
@@ -303,6 +304,12 @@ class HotSearchRedisSpider(scrapy.Spider):
|
||||
driver.get(news_url)
|
||||
self._wait_for_page_load(driver)
|
||||
|
||||
# 标题采集
|
||||
try:
|
||||
title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip()
|
||||
except Exception as e:
|
||||
logging.error(f'标题采集失败,已使用热搜名称...')
|
||||
title = hot_item['hot_word']
|
||||
# 提取页面信息
|
||||
author = self._extract_text(driver, self.AUTHOR_SELECTORS)
|
||||
content = self._extract_content(driver)
|
||||
@@ -318,7 +325,7 @@ class HotSearchRedisSpider(scrapy.Spider):
|
||||
even_details_item['es_srcname'] = 'https://www.toutiao.com/'
|
||||
even_details_item['es_sitename'] = '今日头条'
|
||||
even_details_item['es_sid'] = get_str_md5(news_url)
|
||||
even_details_item['es_urltitle'] = author
|
||||
even_details_item['es_urltitle'] = title
|
||||
even_details_item['es_authors'] = author
|
||||
even_details_item['es_urlcontent'] = content
|
||||
even_details_item['es_urltime'] = url_time
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user