[今日头条] 标题采集优化
This commit is contained in:
parent
5b3bf034f8
commit
592e8b6cfb
@@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import time
|
import time
|
||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional
|
||||||
|
|
||||||
@@ -303,6 +304,12 @@ class HotSearchRedisSpider(scrapy.Spider):
|
|||||||
driver.get(news_url)
|
driver.get(news_url)
|
||||||
self._wait_for_page_load(driver)
|
self._wait_for_page_load(driver)
|
||||||
|
|
||||||
|
# 标题采集
|
||||||
|
try:
|
||||||
|
title = driver.find_elements(By.XPATH, "//div[@class='article-content']/h1")[0].text.strip()
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f'标题采集失败,已使用热搜名称...')
|
||||||
|
title = hot_item['hot_word']
|
||||||
# 提取页面信息
|
# 提取页面信息
|
||||||
author = self._extract_text(driver, self.AUTHOR_SELECTORS)
|
author = self._extract_text(driver, self.AUTHOR_SELECTORS)
|
||||||
content = self._extract_content(driver)
|
content = self._extract_content(driver)
|
||||||
@@ -318,7 +325,7 @@ class HotSearchRedisSpider(scrapy.Spider):
|
|||||||
even_details_item['es_srcname'] = 'https://www.toutiao.com/'
|
even_details_item['es_srcname'] = 'https://www.toutiao.com/'
|
||||||
even_details_item['es_sitename'] = '今日头条'
|
even_details_item['es_sitename'] = '今日头条'
|
||||||
even_details_item['es_sid'] = get_str_md5(news_url)
|
even_details_item['es_sid'] = get_str_md5(news_url)
|
||||||
even_details_item['es_urltitle'] = author
|
even_details_item['es_urltitle'] = title
|
||||||
even_details_item['es_authors'] = author
|
even_details_item['es_authors'] = author
|
||||||
even_details_item['es_urlcontent'] = content
|
even_details_item['es_urlcontent'] = content
|
||||||
even_details_item['es_urltime'] = url_time
|
even_details_item['es_urltime'] = url_time
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user