[百度热搜] 采集时间优化

This commit is contained in:
DELL 2026-02-27 11:19:21 +08:00
parent d4ac0c27cd
commit feb48f3579

View File

@@ -1,16 +1,12 @@
# -*- coding: utf-8 -*-
import json
import logging
import hashlib
import datetime
import re
import time
from urllib.parse import urlparse
import random
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
NoSuchElementException,
TimeoutException,
StaleElementReferenceException,
WebDriverException
@@ -75,6 +71,7 @@ class BaiduHotSearchSprder(scrapy.Spider):
def start_requests(self):
"""发起初始请求"""
self.logger.info(f"开始爬取百度热搜任务ID: {self.job_id if self.job_id else 'N/A'}")
self.url_time = get_current_timestamp()
yield SeleniumRequest(
url=self.start_urls,
callback=self.parse,
@@ -284,10 +281,8 @@ class BaiduHotSearchSprder(scrapy.Spider):
self.logger.warning("标题内容为空,跳过该项")
return None
# 2. 初始化所有字段变量
now_ms = int(time.time() * 1000)
# 基础字段
now_ms = get_current_timestamp()
site_name = '百度热搜'
carrier_type = 'hot_search'
hkey = get_str_md5(title)
@@ -357,9 +352,9 @@ class BaiduHotSearchSprder(scrapy.Spider):
hot_search_item['es_urltitle'] = title
hot_search_item['es_urlcontent'] = desc
hot_search_item['es_carriertype'] = carrier_type
hot_search_item['es_urltime'] = get_current_timestamp()
hot_search_item['es_lasttime'] = get_current_timestamp()
hot_search_item['es_loadtime'] = get_current_timestamp()
hot_search_item['es_urltime'] = self.url_time
hot_search_item['es_lasttime'] = now_ms
hot_search_item['es_loadtime'] = now_ms
hot_search_item['es_hkey'] = hkey
hot_search_item['es_simrank'] = rank
hot_search_item['es_heat'] = heat