[百度热搜] 采集时间优化

This commit is contained in:
DELL 2026-02-27 11:19:21 +08:00
parent d4ac0c27cd
commit feb48f3579

View File

@@ -1,16 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json import json
import logging
import hashlib import hashlib
import datetime
import re import re
import time import time
from urllib.parse import urlparse
import random import random
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.common.exceptions import ( from selenium.common.exceptions import (
NoSuchElementException,
TimeoutException, TimeoutException,
StaleElementReferenceException, StaleElementReferenceException,
WebDriverException WebDriverException
@@ -75,6 +71,7 @@ class BaiduHotSearchSprder(scrapy.Spider):
def start_requests(self): def start_requests(self):
"""发起初始请求""" """发起初始请求"""
self.logger.info(f"开始爬取百度热搜任务ID: {self.job_id if self.job_id else 'N/A'}") self.logger.info(f"开始爬取百度热搜任务ID: {self.job_id if self.job_id else 'N/A'}")
self.url_time = get_current_timestamp()
yield SeleniumRequest( yield SeleniumRequest(
url=self.start_urls, url=self.start_urls,
callback=self.parse, callback=self.parse,
@@ -284,10 +281,8 @@ class BaiduHotSearchSprder(scrapy.Spider):
self.logger.warning("标题内容为空,跳过该项") self.logger.warning("标题内容为空,跳过该项")
return None return None
# 2. 初始化所有字段变量
now_ms = int(time.time() * 1000)
# 基础字段 # 基础字段
now_ms = get_current_timestamp()
site_name = '百度热搜' site_name = '百度热搜'
carrier_type = 'hot_search' carrier_type = 'hot_search'
hkey = get_str_md5(title) hkey = get_str_md5(title)
@@ -357,9 +352,9 @@ class BaiduHotSearchSprder(scrapy.Spider):
hot_search_item['es_urltitle'] = title hot_search_item['es_urltitle'] = title
hot_search_item['es_urlcontent'] = desc hot_search_item['es_urlcontent'] = desc
hot_search_item['es_carriertype'] = carrier_type hot_search_item['es_carriertype'] = carrier_type
hot_search_item['es_urltime'] = get_current_timestamp() hot_search_item['es_urltime'] = self.url_time
hot_search_item['es_lasttime'] = get_current_timestamp() hot_search_item['es_lasttime'] = now_ms
hot_search_item['es_loadtime'] = get_current_timestamp() hot_search_item['es_loadtime'] = now_ms
hot_search_item['es_hkey'] = hkey hot_search_item['es_hkey'] = hkey
hot_search_item['es_simrank'] = rank hot_search_item['es_simrank'] = rank
hot_search_item['es_heat'] = heat hot_search_item['es_heat'] = heat