[百度热搜] 采集时间优化
This commit is contained in:
parent
d4ac0c27cd
commit
feb48f3579
@@ -1,16 +1,12 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import logging
|
||||
import hashlib
|
||||
import datetime
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
import random
|
||||
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.common.exceptions import (
|
||||
NoSuchElementException,
|
||||
TimeoutException,
|
||||
StaleElementReferenceException,
|
||||
WebDriverException
|
||||
@@ -75,6 +71,7 @@ class BaiduHotSearchSprder(scrapy.Spider):
|
||||
def start_requests(self):
|
||||
"""发起初始请求"""
|
||||
self.logger.info(f"开始爬取百度热搜,任务ID: {self.job_id if self.job_id else 'N/A'}")
|
||||
self.url_time = get_current_timestamp()
|
||||
yield SeleniumRequest(
|
||||
url=self.start_urls,
|
||||
callback=self.parse,
|
||||
@@ -284,10 +281,8 @@ class BaiduHotSearchSprder(scrapy.Spider):
|
||||
self.logger.warning("标题内容为空,跳过该项")
|
||||
return None
|
||||
|
||||
# 2. 初始化所有字段变量
|
||||
now_ms = int(time.time() * 1000)
|
||||
|
||||
# 基础字段
|
||||
now_ms = get_current_timestamp()
|
||||
site_name = '百度热搜'
|
||||
carrier_type = 'hot_search'
|
||||
hkey = get_str_md5(title)
|
||||
@@ -357,9 +352,9 @@ class BaiduHotSearchSprder(scrapy.Spider):
|
||||
hot_search_item['es_urltitle'] = title
|
||||
hot_search_item['es_urlcontent'] = desc
|
||||
hot_search_item['es_carriertype'] = carrier_type
|
||||
hot_search_item['es_urltime'] = get_current_timestamp()
|
||||
hot_search_item['es_lasttime'] = get_current_timestamp()
|
||||
hot_search_item['es_loadtime'] = get_current_timestamp()
|
||||
hot_search_item['es_urltime'] = self.url_time
|
||||
hot_search_item['es_lasttime'] = now_ms
|
||||
hot_search_item['es_loadtime'] = now_ms
|
||||
hot_search_item['es_hkey'] = hkey
|
||||
hot_search_item['es_simrank'] = rank
|
||||
hot_search_item['es_heat'] = heat
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user