[百度热搜] 采集时间优化
This commit is contained in:
parent
d4ac0c27cd
commit
feb48f3579
@@ -1,16 +1,12 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import json
|
import json
|
||||||
import logging
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import datetime
|
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from urllib.parse import urlparse
|
|
||||||
import random
|
import random
|
||||||
|
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.common.exceptions import (
|
from selenium.common.exceptions import (
|
||||||
NoSuchElementException,
|
|
||||||
TimeoutException,
|
TimeoutException,
|
||||||
StaleElementReferenceException,
|
StaleElementReferenceException,
|
||||||
WebDriverException
|
WebDriverException
|
||||||
@@ -75,6 +71,7 @@ class BaiduHotSearchSprder(scrapy.Spider):
|
|||||||
def start_requests(self):
|
def start_requests(self):
|
||||||
"""发起初始请求"""
|
"""发起初始请求"""
|
||||||
self.logger.info(f"开始爬取百度热搜,任务ID: {self.job_id if self.job_id else 'N/A'}")
|
self.logger.info(f"开始爬取百度热搜,任务ID: {self.job_id if self.job_id else 'N/A'}")
|
||||||
|
self.url_time = get_current_timestamp()
|
||||||
yield SeleniumRequest(
|
yield SeleniumRequest(
|
||||||
url=self.start_urls,
|
url=self.start_urls,
|
||||||
callback=self.parse,
|
callback=self.parse,
|
||||||
@@ -284,10 +281,8 @@ class BaiduHotSearchSprder(scrapy.Spider):
|
|||||||
self.logger.warning("标题内容为空,跳过该项")
|
self.logger.warning("标题内容为空,跳过该项")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# 2. 初始化所有字段变量
|
|
||||||
now_ms = int(time.time() * 1000)
|
|
||||||
|
|
||||||
# 基础字段
|
# 基础字段
|
||||||
|
now_ms = get_current_timestamp()
|
||||||
site_name = '百度热搜'
|
site_name = '百度热搜'
|
||||||
carrier_type = 'hot_search'
|
carrier_type = 'hot_search'
|
||||||
hkey = get_str_md5(title)
|
hkey = get_str_md5(title)
|
||||||
@@ -357,9 +352,9 @@ class BaiduHotSearchSprder(scrapy.Spider):
|
|||||||
hot_search_item['es_urltitle'] = title
|
hot_search_item['es_urltitle'] = title
|
||||||
hot_search_item['es_urlcontent'] = desc
|
hot_search_item['es_urlcontent'] = desc
|
||||||
hot_search_item['es_carriertype'] = carrier_type
|
hot_search_item['es_carriertype'] = carrier_type
|
||||||
hot_search_item['es_urltime'] = get_current_timestamp()
|
hot_search_item['es_urltime'] = self.url_time
|
||||||
hot_search_item['es_lasttime'] = get_current_timestamp()
|
hot_search_item['es_lasttime'] = now_ms
|
||||||
hot_search_item['es_loadtime'] = get_current_timestamp()
|
hot_search_item['es_loadtime'] = now_ms
|
||||||
hot_search_item['es_hkey'] = hkey
|
hot_search_item['es_hkey'] = hkey
|
||||||
hot_search_item['es_simrank'] = rank
|
hot_search_item['es_simrank'] = rank
|
||||||
hot_search_item['es_heat'] = heat
|
hot_search_item['es_heat'] = heat
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user