[今日头条] Collect related news

This commit is contained in:
parent feb48f3579
commit d01035577b
@@ -76,7 +76,7 @@ class SeleniumMiddleware:
         }
         edge_options.add_experimental_option("prefs", prefs)

-        self.driver = Edge(executable_path=r"C:\Program Files\Python38\msedgedriver.exe", options=edge_options)
+        self.driver = Edge(executable_path=r"D:\msedgedriver.exe", options=edge_options)

     @classmethod
     def from_crawler(cls, crawler):
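The driver path above is hard-coded; a minimal sketch of reading it from a settings key instead (MSEDGEDRIVER_PATH and the settings object in scope are assumptions, not part of this commit):

# Sketch only: MSEDGEDRIVER_PATH is an assumed settings key.
driver_path = crawler.settings.get('MSEDGEDRIVER_PATH', r'D:\msedgedriver.exe')
self.driver = Edge(executable_path=driver_path, options=edge_options)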
@@ -45,6 +45,7 @@ class BaiduHotSearchSprder(scrapy.Spider):
             'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
             'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
         },
+        'DOWNLOADER_MIDDLEWARES': {},
         'BATCH_SAVE_SIZE': 50
     }

@@ -0,0 +1,295 @@
# -*- coding: utf-8 -*-
import json
import time
from typing import List, Dict, Any, Optional

import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.date_utils import get_time_stamp
from MediaSpiders.utils.hot_search_json_parser import url_response
from MediaSpiders.utils.string_utils import get_str_md5
from MediaSpiders.utils.time_utils import get_current_timestamp


class HotSearchRedisSpider(scrapy.Spider):
    name = 'HotSearchRedisSpider'

    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/twitter',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_ship_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
        },
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
        }
    }

    # Constants
    TOUTIAO_HOT_URL = 'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
    BAIDU_URL = 'https://www.toutiao.com/'
    PAGE_LOAD_TIMEOUT = 10
    ELEMENT_WAIT_TIMEOUT = 5
    MAX_NEWS_PER_HOT = 6
    MAX_HOT_ITEMS = 10

    # Selectors
    URL_SELECTORS = [
        '.l-content a',
        '.feed-card-wtt-l p a',
        '.feed-card-article-l a'
    ]

    AUTHOR_SELECTORS = [
        "//div[@class='author-info']/div[@class='desc']/a[@class='name']",
        "//div[@class='user-info']/a[@class='user-name']"
    ]

    CONTENT_SELECTORS = [
        "//div[@class='article-content']//p",
        "//article/div[@class='weitoutiao-html']"
    ]

    TIME_SELECTORS = [
        "//p[@class='abstract']/span[@class='time']",
        "//div[@class='article-meta']/span[1]"
    ]

    # Text patterns used to filter out boilerplate paragraphs
    SKIP_PATTERNS = ['版权', '声明', '邮箱', '记者', '编辑', '来源', '投稿', '责任编辑']

    def __init__(self, params=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_time = get_current_timestamp()
        self.total_num = 0
        self.authorization = None
        self.job_id = None

        if params:
            try:
                json_params = json.loads(params)
                self.total_num = int(json_params.get('totalNum', 0))
                self.authorization = json_params.get('authorization')
                self.job_id = json_params.get('job_id')
            except (json.JSONDecodeError, ValueError) as e:
                self.logger.error(f"解析参数失败: {e}")

    def start_requests(self):
        """Start requests."""
        yield SeleniumRequest(
            url=self.BAIDU_URL,
            callback=self.parse_parent,
            wait_time=self.PAGE_LOAD_TIMEOUT
        )

    def parse_parent(self, response):
        """Parse the hot-search list page."""
        driver = response.request.meta['driver']

        # Fetch the hot-board data
        hot_items = self._fetch_hot_items()

        for hot_item in hot_items[:self.MAX_HOT_ITEMS]:
            if not hot_item.get('fake_url'):
                self.logger.warning(f"热点 {hot_item['hot_word']} 无有效URL,跳过")
                continue

            yield from self._process_hot_item(driver, hot_item)

    def _fetch_hot_items(self) -> List[Dict[str, Any]]:
        """Fetch the hot-board entries."""
        try:
            rsp_body = url_response(self.TOUTIAO_HOT_URL)
            if rsp_body.get('status') != "success":
                self.logger.error("获取热点数据失败")
                return []

            result_array = []
            for line in rsp_body.get('data', []):
                try:
                    result_array.append({
                        "hot_id": line.get('ClusterIdStr', ''),
                        "hot_word": line.get('Title', ''),
                        "hot_value": int(line.get('HotValue', 0)),
                        "fake_url": line.get('Url', '')
                    })
                except Exception as e:
                    self.logger.error(f"解析热点数据失败: {e}")
                    self.logger.debug(f"问题数据: {line}")

            return result_array

        except Exception as e:
            self.logger.error(f"获取热点数据异常: {e}")
            return []

    def _process_hot_item(self, driver, hot_item: Dict[str, Any]):
        """Process a single hot-search entry."""
        try:
            # Load the hot-topic page
            driver.get(hot_item['fake_url'])
            self._wait_for_page_load(driver)

            # Collect the news cards
            news_cards = self._get_news_cards(driver)
            news_urls_array = []
            for card in news_cards:
                news_url = self._extract_url_from_card(card)
                if not news_url:
                    continue

                if "video" in news_url.lower():
                    self.logger.info(f"跳过视频链接: {news_url}")
                    continue
                news_urls_array.append(news_url)

            for url in news_urls_array:
                yield from self._process_news_page(driver, url, hot_item)

        except Exception as e:
            self.logger.error(f"处理热点 '{hot_item['hot_word']}' 失败: {e}")

    def _wait_for_page_load(self, driver, timeout: int = None):
        """Wait for the page to load."""
        timeout = timeout or self.PAGE_LOAD_TIMEOUT
        time.sleep(2)  # base wait
        try:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except TimeoutException:
            self.logger.warning("页面加载超时")

    def _get_news_cards(self, driver) -> List[WebElement]:
        """Collect the news card elements."""
        try:
            cards = driver.find_elements(By.CSS_SELECTOR, ".feed-card-wrapper")
            return cards[:self.MAX_NEWS_PER_HOT]
        except Exception as e:
            self.logger.error(f"获取新闻卡片失败: {e}")
            return []

    def _extract_url_from_card(self, card: WebElement) -> Optional[str]:
        """Extract a news URL from a card element."""
        for selector in self.URL_SELECTORS:
            try:
                element = card.find_element(By.CSS_SELECTOR, selector)
                url = element.get_attribute('href')
                if url and url.startswith(('http://', 'https://')):
                    return url
            except NoSuchElementException:
                continue
        return None

    def _process_news_page(self, driver, news_url: str, hot_item: Dict[str, Any]):
        """Process a single news page and yield an item."""
        try:
            driver.get(news_url)
            self._wait_for_page_load(driver)

            # Extract page fields
            author = self._extract_text(driver, self.AUTHOR_SELECTORS)
            content = self._extract_content(driver)
            url_time = self._extract_time(driver)

            if not content:
                self.logger.warning(f"页面无有效内容: {news_url}")
                return

            # Build the item
            even_details_item = MediaspidersItem()
            even_details_item['es_carriertype'] = 'news'
            even_details_item['es_srcname'] = 'https://www.toutiao.com/'
            even_details_item['es_sitename'] = '今日头条'
            even_details_item['es_sid'] = get_str_md5(news_url)
            even_details_item['es_urltitle'] = author
            even_details_item['es_authors'] = author
            even_details_item['es_urlcontent'] = content
            even_details_item['es_urltime'] = url_time
            even_details_item['es_lasttime'] = url_time
            even_details_item['es_urlname'] = news_url
            even_details_item['es_hkey'] = hot_item['hot_id']
            even_details_item['es_urltopic'] = hot_item['hot_word']
            even_details_item['es_video'] = ''

            yield even_details_item

        except Exception as e:
            self.logger.error(f"处理新闻页面失败 {news_url}: {e}")

    def _extract_text(self, context, selectors: List[str]) -> Optional[str]:
        """Return the text of the first visible element matched by the selectors."""
        for selector in selectors:
            try:
                elements = context.find_elements(By.XPATH, selector)

                for elem in elements:
                    if elem.is_displayed():
                        text = elem.text.strip()
                        if text:
                            return text
            except Exception as e:
                self.logger.debug(f"选择器 '{selector}' 未匹配: {e}")
        return None

    def _extract_content(self, driver) -> str:
        """Extract the article body text."""
        try:
            time.sleep(2)  # wait for the content to load

            content_lines = []
            for selector in self.CONTENT_SELECTORS:
                try:
                    paragraphs = driver.find_elements(By.XPATH, selector)

                    for p in paragraphs:
                        # Article paragraphs and the weitoutiao block both expose
                        # their text via the element's .text attribute.
                        text = p.text.strip()

                        if text != '':
                            content_lines.append(text)

                except Exception as e:
                    self.logger.debug(f"选择器 '{selector}' 提取失败: {e}")

            return '\n'.join(content_lines) if content_lines else ""

        except Exception as e:
            self.logger.error(f"提取内容失败: {e}")
            return ""

    def _is_valid_content(self, text: str) -> bool:
        """Check whether a paragraph is meaningful content."""
        if not text or len(text) <= 10:
            return False
        return not any(pattern in text for pattern in self.SKIP_PATTERNS)

    def _extract_time(self, driver) -> Optional[int]:
        """Extract the publish time."""
        time_text = self._extract_text(driver, self.TIME_SELECTORS)
        if time_text:
            try:
                return get_time_stamp(time_text)
            except Exception as e:
                self.logger.debug(f"时间转换失败: {time_text}, {e}")
        return self.url_time

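A minimal launch sketch for the new spider: the params keys (totalNum, authorization, job_id) mirror what __init__ above reads, and the values shown are placeholders.

# Launch sketch with placeholder values; params keys follow __init__ above.
import json
from scrapy.cmdline import execute

params = json.dumps({"totalNum": 10, "authorization": "<token>", "job_id": "demo-1"})
execute(['scrapy', 'crawl', 'HotSearchRedisSpider', '-a', f'params={params}'])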
@@ -1,7 +1,11 @@
-import scrapy
 import json
-from MediaSpiders.utils.hot_search_json_parser import parse_weibo_response, parse_toutiao_response
+
+import redis
+import scrapy
+
 from MediaSpiders.items import MediaspidersItem
+from MediaSpiders.utils.hot_search_json_parser import parse_weibo_response, parse_toutiao_response
+from MediaSpiders.utils.time_utils import get_current_timestamp


 class HotSearchSpider(scrapy.Spider):
@@ -22,25 +26,51 @@ class HotSearchSpider(scrapy.Spider):
             'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
             # 'MediaSpiders.pipelines.HotSearchSaveToMySQL': 300
         },
-        'SPIDER_MIDDLEWARES': {},
+        'SPIDER_MIDDLEWARES': {
+            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
+            'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
+        },
         'DOWNLOADER_MIDDLEWARES': {},
         'BATCH_SAVE_SIZE': 50
     }

     start_urls = [
-        'https://weibo.com/ajax/side/hotSearch',
+        # 'https://weibo.com/ajax/side/hotSearch',
         'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
     ]

     def __init__(self, params=None, *args, **kwargs):
         super(HotSearchSpider, self).__init__(*args, **kwargs)
+        self.job_id = None
+        self.collected_items = []
+        self.redis_client = redis.Redis(
+            host=self.settings['REDIS_HOST'],
+            port=self.settings['REDIS_PORT'],
+            password=self.settings['REDIS_PWD']
+        )
         if params:
+            try:
                 json_params = json.loads(params)
                 if 'job_id' in json_params:
                     self.job_id = json_params['job_id']
+                if 'max_items' in json_params:
+                    self.max_items = int(json_params['max_items'])
+            except Exception as e:
+                self.logger.error(f"解析参数失败: {str(e)}")
+
+    def start_requests(self):
+        """Issue the initial requests."""
+        self.logger.info(f"开始爬取热搜数据,任务ID: {self.job_id if self.job_id else 'N/A'}")
+        self.url_time = get_current_timestamp()
+        for url in self.start_urls:
+            yield scrapy.Request(
+                url=url,
+                callback=self.parse
+            )

     def parse(self, response):
         result_array = []
+        try:
             if 'weibo.com' in response.url:
                 result_array = parse_weibo_response(response.text)
             elif 'toutiao.com' in response.url:
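One caveat with the redis.Redis call in __init__: Scrapy normally binds self.settings after __init__ runs (via from_crawler), so reading REDIS_HOST/REDIS_PORT/REDIS_PWD there can raise AttributeError depending on how the spider is instantiated. Below is a minimal sketch of the usual from_crawler pattern, offered as an assumption about how this could be wired rather than as what this commit does.

# Sketch only, intended to live inside HotSearchSpider; crawler.settings is available here.
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super().from_crawler(crawler, *args, **kwargs)
    spider.redis_client = redis.Redis(
        host=crawler.settings.get('REDIS_HOST'),
        port=crawler.settings.getint('REDIS_PORT'),
        password=crawler.settings.get('REDIS_PWD')
    )
    return spider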
@@ -60,4 +90,12 @@ class HotSearchSpider(scrapy.Spider):
                 hot_search_item['es_urltime'] = line['onboard_time']
                 hot_search_item['es_lasttime'] = line['crawl_time']
                 hot_search_item['es_urlname'] = line['fake_url']
+
                 yield hot_search_item
+
+        except Exception as e:
+            self.logger.exception(f"解析异常: {str(e)}")
+
@@ -2,11 +2,13 @@ import json
 import uuid
 import logging
 import time

+import requests
+
 from MediaSpiders.utils.string_utils import get_str_md5


-def parse_weibo_response(rsp_str):
-    rsp_body = json.loads(rsp_str)
+def parse_weibo_response(rsp_body):
     result_array = []
     if rsp_body['ok'] == 1:
         realtime_data = rsp_body['data']['realtime']
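Since parse_weibo_response now takes a parsed body instead of a raw string, a caller would pass a dict. The existing call in HotSearchSpider.parse still passes response.text (that branch is unreachable while the weibo URL is commented out), so the adjusted call below is an assumed follow-up, not part of this diff.

# Assumed call-site adjustment, not part of this diff.
result_array = parse_weibo_response(json.loads(response.text))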
@@ -56,7 +58,7 @@ def parse_toutiao_response(rsp_str):
             "platform": "今日头条",
             "onboard_time": current_timestamp,
             "crawl_time": current_timestamp,
-            "fake_url": f"https://www.toutiao.com/hot-event/hot-board/{custom_sid}"
+            "fake_url": line['Url']
         }
         if 'InterestCategory' in line:
             result_line['category'] = ",".join(line['InterestCategory'])
@@ -66,6 +68,11 @@ def parse_toutiao_response(rsp_str):
         logging.info(json.dumps(line, ensure_ascii=False))
     return result_array


+def url_response(url):
+    rsp_str = requests.get(url).text
+    return json.loads(rsp_str)
+
+
 if __name__ == "__main__":
     # rsp_file = open("./toutiao_hot_search.json", 'r', encoding='utf-8')
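A short usage sketch for the new url_response helper. The status/data/Title/HotValue/Url fields mirror how HotSearchRedisSpider._fetch_hot_items reads the hot-board response; they are assumptions about the API shape, not something this commit documents.

# Usage sketch; field names follow _fetch_hot_items above.
body = url_response('https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc')
if body.get('status') == 'success':
    for line in body.get('data', [])[:3]:
        print(line.get('Title'), line.get('HotValue'), line.get('Url'))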
@@ -1,10 +1,67 @@
-import datetime
 import time
+from datetime import datetime, timezone, timedelta
+import re
+

 def get_current_timestamp():
     return int(time.time() * 1000)


+def str_to_timestamp(dt_str: str, tz_offset: int = 8) -> int:
+    """
+    Convert a time string to a Unix timestamp (seconds).
+
+    Supported formats:
+    - 'YYYY-MM-DD HH:MM'
+    - 'YYYY-MM-DD HH:MM:SS'
+    - mixed strings containing extra text (e.g. "2026-02-27 20:11·头条新锐创作者")
+
+    Args:
+        dt_str: time string (the time part is extracted automatically)
+        tz_offset: timezone offset in hours; use 8 for China
+
+    Returns:
+        Integer timestamp
+
+    Raises:
+        ValueError: if no valid time format can be extracted
+    """
+    # Strip surrounding whitespace
+    dt_str = dt_str.strip()
+
+    # Extract the time part with a regex (matches YYYY-MM-DD HH:MM or YYYY-MM-DD HH:MM:SS)
+    time_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}(?::\d{2})?)'
+    match = re.search(time_pattern, dt_str)
+
+    if not match:
+        raise ValueError(f"无法从字符串中提取有效时间格式: {dt_str}")
+
+    # The matched time string
+    time_str = match.group(1)
+
+    # Parse according to its length/format
+    try:
+        if len(time_str) == 16:  # 'YYYY-MM-DD HH:MM'
+            dt = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
+        elif len(time_str) == 19:  # 'YYYY-MM-DD HH:MM:SS'
+            dt = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
+        else:
+            # Fall back to trying each supported format
+            for fmt in ['%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S']:
+                try:
+                    dt = datetime.strptime(time_str, fmt)
+                    break
+                except ValueError:
+                    continue
+            else:
+                raise ValueError(f"无法解析的时间格式: {time_str}")
+    except ValueError as e:
+        raise ValueError(f"时间格式解析失败: {time_str}") from e
+
+    # Build the target timezone
+    tz = timezone(timedelta(hours=tz_offset))
+
+    # Return the timestamp
+    return int(dt.replace(tzinfo=tz).timestamp())
+
+
 def get_time_stamp(date_str):
     try:
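A small usage sketch for the new str_to_timestamp helper, using the mixed "time·author" string format its docstring describes:

# Usage sketch: the regex extraction drops the trailing author text.
ts = str_to_timestamp("2026-02-27 20:11·头条新锐创作者")  # seconds, interpreted as UTC+8
print(ts)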
@@ -20,4 +20,4 @@ dirpath = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(dirpath)
 # Equivalent to: scrapy crawl FacebookUserSpider -a params="{}"
 # execute(['scrapy', 'crawl', 'hot_search_spider', '-a', 'params={}'])
-execute(['scrapy', 'crawl', 'BaiduHotSearchSprder', '-a', 'params={}'])
+execute(['scrapy', 'crawl', 'HotSearchRedisSpider', '-a', 'params={}'])