[linkedin] 用户基本信息采集
This commit is contained in:
parent
a375b8ead1
commit
d023703622
@ -61,6 +61,7 @@ class SeleniumMiddleware:
|
|||||||
edge_options.use_chromium = True
|
edge_options.use_chromium = True
|
||||||
# edge_options.add_argument("--headless")
|
# edge_options.add_argument("--headless")
|
||||||
# 隐藏“受自动化软件控制”提示栏
|
# 隐藏“受自动化软件控制”提示栏
|
||||||
|
edge_options.add_argument('--disable-blink-features=AutomationControlled')
|
||||||
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
||||||
# 禁用自动化扩展
|
# 禁用自动化扩展
|
||||||
edge_options.add_experimental_option('useAutomationExtension', False)
|
edge_options.add_experimental_option('useAutomationExtension', False)
|
||||||
|
|||||||
@ -50,6 +50,7 @@ BATCH_SAVE_SIZE = 5
|
|||||||
|
|
||||||
TWITTER_FILTER_KEY = 'URL_Filter:MediaSpiders:Twitter_Filter'
|
TWITTER_FILTER_KEY = 'URL_Filter:MediaSpiders:Twitter_Filter'
|
||||||
FACEBOOK_FILTER_KEY = 'URL_Filter:MediaSpiders:Facebook_Filter'
|
FACEBOOK_FILTER_KEY = 'URL_Filter:MediaSpiders:Facebook_Filter'
|
||||||
|
LINKEDIN_FILTER_KEY = 'URL_Filter:MediaSpiders:Linkedin_Filter'
|
||||||
YOUTUBE_FILTER_KEY = 'URL_Filter:MediaSpiders:Youtube_Filter'
|
YOUTUBE_FILTER_KEY = 'URL_Filter:MediaSpiders:Youtube_Filter'
|
||||||
WEIBO_FILTER_KEY = 'URL_Filter:MediaSpiders:Weibo_Filter'
|
WEIBO_FILTER_KEY = 'URL_Filter:MediaSpiders:Weibo_Filter'
|
||||||
WECHAT_FILTER_KEY = 'URL_Filter:MediaSpiders:Wechat_Filter'
|
WECHAT_FILTER_KEY = 'URL_Filter:MediaSpiders:Wechat_Filter'
|
||||||
@ -57,6 +58,7 @@ FLICKR_FILTER_KEY = 'URL_Filter:MediaSpiders:Flickr_Filter'
|
|||||||
|
|
||||||
TWITTER_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Twitter_Filter'
|
TWITTER_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Twitter_Filter'
|
||||||
FACEBOOK_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Facebook_Filter'
|
FACEBOOK_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Facebook_Filter'
|
||||||
|
LINKEDIN_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Linkedin_Filter'
|
||||||
YOUTUBE_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Youtube_Filter'
|
YOUTUBE_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Youtube_Filter'
|
||||||
WEIBO_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Weibo_Filter'
|
WEIBO_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Weibo_Filter'
|
||||||
WECHAT_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Wechat_Filter'
|
WECHAT_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Wechat_Filter'
|
||||||
|
|||||||
408
spiders/MediaSpiders/MediaSpiders/spiders/LinkedinSpider.py
Normal file
408
spiders/MediaSpiders/MediaSpiders/spiders/LinkedinSpider.py
Normal file
@ -0,0 +1,408 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# 标准库
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import json
|
||||||
|
import logging as logger
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
import redis
|
||||||
|
import scrapy
|
||||||
|
from redisbloom.client import Client
|
||||||
|
from scrapy_selenium import SeleniumRequest
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
|
||||||
|
from MediaSpiders.items import MediaspidersItem
|
||||||
|
from MediaSpiders.utils.string_utils import get_str_md5
|
||||||
|
from MediaSpiders.utils.time_utils import get_current_timestamp
|
||||||
|
|
||||||
|
|
||||||
|
class LinkedinSpider(scrapy.Spider):
    """Collects LinkedIn user activity (posts and, optionally, their comments)
    through a Selenium-driven browser session authenticated via stored cookies.
    """

    name = 'LinkedinUserSpider'

    # Queue of articles whose comment sections still need to be crawled.
    # NOTE(review): this is a class-level mutable list, shared by every
    # instance — fine for a single spider process; verify if several
    # instances ever run in one interpreter.
    comment_urls = []

    # Per-spider Scrapy settings: protobuf serialization targets, media
    # storage paths, pipeline ordering and the dedup/filter middlewares.
    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/linkedin',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_publicinfo_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300
        },
        'DOWNLOAD_DELAY': 2,
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
            'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
        }
    }
|
def __init__(self, params=None, *args, **kwargs):
    """Initialize the spider from a JSON string of launch parameters.

    :param params: JSON-encoded dict; may carry a 'job_id' key.
        NOTE(review): the default of None would make json.loads raise a
        TypeError — presumably the spider is always launched with params;
        confirm against the launcher.
    """
    super(LinkedinSpider, self).__init__(*args, **kwargs)
    parsed = json.loads(params)
    logger.info(parsed)

    # Runtime state; the redis/bloom clients are connected lazily in
    # start_requests.
    self.crawl_comment = False
    self.redis_client = None
    self.bloom_filter = None
    self.simhash_filter_key = None

    if 'job_id' in parsed:
        self.job_id = parsed['job_id']
||||||
|
def start_requests(self):
    """Connect Redis / RedisBloom clients, then kick off the Selenium session.

    The initial request goes to google.com only to obtain a live webdriver;
    the real navigation to LinkedIn happens inside parse().
    """
    cfg = self.settings
    self.redis_client = redis.Redis(host=cfg['REDIS_HOST'],
                                    port=cfg['REDIS_PORT'],
                                    password=cfg['REDIS_PWD'])
    self.bloom_filter = Client(host=cfg['REDIS_HOST'],
                               port=cfg['REDIS_PORT'],
                               password=cfg['REDIS_PWD'])
    self.simhash_filter_key = cfg['LINKEDIN_SIMHASH_FILTER_KEY']
    yield SeleniumRequest(url='https://www.google.com/', callback=self.parse)
||||||
|
def parse(self, response):
    """Authenticate with stored cookies, then crawl each target user's
    recent-activity feed and yield one item per post.

    Posts with comments are queued in self.comment_urls; when
    self.crawl_comment is enabled, the first queued comment page is
    requested at the end.
    """
    logger.info("login linkedin")
    driver = response.request.meta['driver']
    driver.maximize_window()

    # Cookies can only be set for the current domain, so visit it first.
    driver.get("https://www.linkedin.com/")
    time.sleep(2)

    # Pull the saved cookie header from Redis and parse it into name/value
    # pairs.
    cookie_string = self.redis_client.get("MediaSpiders:Linkedin_Cookies").decode()
    cookie_dict = self.form_cookie_dict(cookie_string)

    # Convert to Selenium's cookie format (domain and path are mandatory).
    cookies_to_add = [
        {
            'name': name,
            'value': value,
            'domain': '.linkedin.com',
            'path': '/',
            'secure': True
        }
        for name, value in cookie_dict.items()
    ]
    for cookie in cookies_to_add:
        try:
            driver.add_cookie(cookie)
        except Exception as e:
            logger.warning(f"Failed to add cookie {cookie['name']}: {e}")

    driver.refresh()
    time.sleep(5)

    # TODO confirm: the target-account list is currently hard-coded; the
    # SOCIAL_USER_QUERY_ALL_API call that used to populate all_user_info
    # is disabled in this revision.
    all_user_info = [
        {'id': 87, 'userFlag': '0', 'userName': 'andrewyng', 'userType': '2', 'userUid': 'USForcesJapan.J'}]

    for user_info in all_user_info:
        user_name = user_info['userName']
        current_url = f'https://www.linkedin.com/in/{user_name}/recent-activity/all/'
        driver.get(current_url)
        time.sleep(5)

        # Scroll incrementally so the feed lazy-loads more posts.
        self.smart_scroll(driver, max_scrolls=5)

        # Each post is rendered as a 'feed-shared-update-v2' card.
        current_page_articles = driver.find_elements(
            By.XPATH,
            "//div[contains(@class, 'feed-shared-update-v2')]"
        )

        logger.info(f"Found {len(current_page_articles)} articles for {user_name}")
        items = self.get_linkedin_articles(current_page_articles, user_name, user_info['userUid'])

        for item in items:
            # Remember posts that have comments for a later comment pass.
            if item.get('es_commentcount', 0) > 0:
                self.comment_urls.append({
                    'url': item['es_urlname'],
                    'article_id': item['es_sid'],
                    'article_author': item['es_authors'],
                    'article_text': item['es_urlcontent']
                })
            logger.info(f"用户 {item['es_userid']} 发文: {item['es_urlcontent'][:50]}...")
            yield item

    # Comment crawling is a separate pass (LinkedIn requires clicking to
    # expand the comment section).
    if self.crawl_comment and self.comment_urls:
        comment_url = self.comment_urls.pop()
        yield SeleniumRequest(
            url=comment_url['url'],
            callback=self.linkedin_comment_parse,
            meta={'article_id': comment_url['article_id'], 'driver': driver}
        )
|
def smart_scroll(self, driver, max_scrolls=5):
    """Scroll to the page bottom up to max_scrolls times, stopping early
    once the document height no longer grows (no new lazy-loaded content).
    """
    prev_height = driver.execute_script("return document.body.scrollHeight")
    for attempt in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # give the dynamic content time to arrive

        # Compare heights to detect whether anything new was loaded.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == prev_height:
            logger.info(f"滚动条 {attempt + 1}:未加载新内容,停止")
            break
        prev_height = new_height
        logger.info(f"滚动条 {attempt + 1}:加载了新内容到高度 {new_height}")
|
def get_linkedin_articles(self, articles, Uname, uid):
    """Parse LinkedIn feed-card WebElements into MediaspidersItem objects.

    :param articles: selenium WebElements for 'feed-shared-update-v2' cards
    :param Uname: fallback author name when a card has no author element
    :param uid: platform user id, stored on every item as es_userid
    :return: list of de-duplicated, non-empty items ready to yield
    """
    article_items = []
    for idx, article in enumerate(articles):
        try:
            # === 1. Author name (fall back to the crawled profile name) ===
            try:
                author_elem = article.find_element(By.XPATH,
                                                   ".//span[contains(@class, 'update-components-actor__title')]//span[@aria-hidden='true']")
                uname = author_elem.text.strip()
            except Exception:
                uname = Uname

            # === 2. Publish time: relative text -> epoch milliseconds ===
            try:
                time_elem = article.find_element(By.XPATH,
                                                 ".//span[contains(@class, 'update-components-actor__sub-description')]")
                relative_time = time_elem.text.split('•')[0].strip()  # e.g. "1 个月前"
                article_time = self.parse_linkedin_relative_time(relative_time)
            except Exception as e:
                logger.warning(f"Time parse failed: {e}")
                article_time = get_current_timestamp() - 86400000  # default: 24h ago

            # === 3. Body text (merge multiple paragraphs) ===
            try:
                content_parts = article.find_elements(By.XPATH,
                                                      ".//div[contains(@class, 'update-components-text')]//span[@dir='ltr']")
                article_text = " ".join([p.text for p in content_parts if p.text.strip()])
            except Exception:
                article_text = ""

            # === 4. Article URL from the card's activity URN ===
            # BUG FIX: the fallback branch previously assigned `article_url`,
            # leaving `url_name` undefined (NameError on the first card) or
            # stale from the previous iteration.
            try:
                activity_urn = article.get_attribute("data-urn")
                url_name = f"https://www.linkedin.com/feed/update/{activity_urn}"
            except Exception:
                url_name = f"https://www.linkedin.com/in/{uname}/"

            # === 5. Image URLs (lazy-loaded via data-delayed-url) ===
            img_urls = []
            try:
                img_urls = [
                    img.get_attribute('data-delayed-url').strip()
                    for img in
                    article.find_elements(By.XPATH, ".//img[contains(@class, 'update-components-image__image')]")
                    if img.get_attribute('data-delayed-url')
                ]
            except Exception:
                pass

            # === 6. Interaction counts (likes / comments / reposts) ===
            like_count = comment_count = forward_count = 0
            try:
                like_btn = article.find_element(By.XPATH,
                                                ".//span[contains(@class, 'social-details-social-counts')]").text
                like_count = self.extract_number(like_btn)

                comment_btn = article.find_element(By.XPATH, ".//button[contains(@aria-label, '评论')]").text
                comment_count = self.extract_number(comment_btn)

                repost_btn = article.find_element(By.XPATH,
                                                  ".//button[contains(@aria-label, '转发')]").text
                forward_count = self.extract_number(repost_btn)
            except Exception as e:
                logger.debug(f"Interaction count parse failed: {e}")

            # === 7. Connection-degree badge (LinkedIn-specific marker) ===
            # NOTE(review): this XPath is absolute ("//span..."), so even
            # though it is called on `article` it searches the whole page —
            # confirm whether ".//span..." was intended.
            try:
                degree_span = article.find_element(
                    By.XPATH,
                    "//span[@aria-hidden='true' and contains(., '•') and contains(., '度+')]"
                )
                degree_text = degree_span.text.strip()
            except Exception:
                degree_text = ""

            es_content = article_text.replace('[Original text:]', '').strip()

            # === 8. Build the item ===
            article_id = get_str_md5(f"{uname}{article_text}{article_time}")
            item = MediaspidersItem()
            item['es_sid'] = article_id
            item['es_hkey'] = article_id
            item['es_content'] = es_content
            item['es_urlcontent'] = es_content
            item['es_urltime'] = article_time  # epoch ms from step 2
            item['es_lasttime'] = get_current_timestamp()
            item['es_loadtime'] = get_current_timestamp()
            item['es_urltitle'] = uname
            item['es_authors'] = uname
            item['es_userid'] = uid
            item['image_urls'] = img_urls
            item['file_urls'] = []
            item['es_urlname'] = url_name
            item['es_commentcount'] = comment_count
            item['es_forwardcount'] = forward_count
            item['es_likecount'] = like_count
            item['es_sitename'] = 'linkedin'
            item['es_srcname'] = 'linkedin'
            item['es_carriertype'] = 'media'
            item['es_heat'] = degree_text

            # Dedup: posts older than 48h go through the bloom filter;
            # bfAdd <= 0 means the id was already present.
            if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
                if self.bloom_filter.bfAdd(self.settings['LINKEDIN_FILTER_KEY'], article_id) <= 0:
                    logger.info(f"跳过已采集内容: {article_id[:10]}...")
                    continue

            if not item['es_urlcontent']:
                logger.warning("跳过空内容动态")
                continue

            article_items.append(item)
            logger.debug(f"Article {idx}: {uname} - {article_text[:30]}...")

        except Exception as e:
            logger.error(f"解析动态失败 (index {idx}): {repr(e)}")
            continue

    logger.info(f"用户 {uid} 共采集 {len(article_items)} 条有效动态")
    return article_items
|
def parse_linkedin_relative_time(self, text):
    """Convert a LinkedIn relative timestamp to epoch milliseconds.

    Handles Chinese ("1 个月前") and English ("3 hours ago") forms.

    BUG FIX: the English checks previously matched only plural stems
    ('hoursago', 'daysago', ...), so singular inputs like "1 hour ago"
    silently fell through to the 24h-ago default. Matching on the
    singular stem ('hour', 'day', ...) covers both. Also guards against
    text with no digits, which previously raised AttributeError on
    ``re.search(...).group()``.

    :param text: relative-time string as scraped from the feed card
    :return: epoch milliseconds (int); defaults to "24 hours ago" when
        the format is unrecognized
    """
    now = datetime.now()
    text = text.lower().replace(' ', '')

    match = re.search(r'\d+', text)
    if match is None:
        return get_current_timestamp() - 86400000  # default: 24h ago
    amount = int(match.group())

    if '秒前' in text or 'second' in text:
        delta = timedelta(seconds=amount)
    elif '分钟前' in text or 'minute' in text:
        delta = timedelta(minutes=amount)
    elif '小时前' in text or 'hour' in text:
        delta = timedelta(hours=amount)
    elif '天前' in text or 'day' in text:
        delta = timedelta(days=amount)
    elif '周前' in text or 'week' in text:
        delta = timedelta(weeks=amount)
    elif '月前' in text or 'month' in text:
        delta = timedelta(days=amount * 30)  # approximate: 1 month ~ 30 days
    elif '年前' in text or 'year' in text:
        delta = timedelta(days=amount * 365)  # approximate: 1 year ~ 365 days
    else:
        return get_current_timestamp() - 86400000  # default: 24h ago

    return int((now - delta).timestamp() * 1000)
|
def extract_number(self, text):
    """Extract the first integer from text like "1,234 个赞" -> 1234.

    Returns 0 when no number can be found or parsed.

    BUG FIX: replaced the bare ``except:`` (which also swallowed
    KeyboardInterrupt/SystemExit) with the specific failures this code
    can actually hit: no match (AttributeError), a commas-only match
    (ValueError), and non-string input (TypeError).
    """
    try:
        num_str = re.search(r'[\d,]+', text).group().replace(',', '')
        return int(num_str)
    except (AttributeError, ValueError, TypeError):
        return 0
|
def linkedin_comment_parse(self, response):
    """Expand and scrape the comment section of one article, then queue
    the next comment page from self.comment_urls.

    :param response: SeleniumRequest response; meta carries 'driver' and
        the parent post's 'article_id'
    """
    driver = response.meta['driver']
    article_id = response.meta['article_id']

    # Click the "comment" button to expand the comment section.
    try:
        comment_btn = driver.find_element(By.XPATH,
                                          "//button[contains(@class, 'comments-comment-button')]")
        comment_btn.click()
        time.sleep(3)
    except Exception:
        logger.warning("未找到评论按钮,跳过评论爬取")
        return

    # Scroll so more comments lazy-load.
    self.smart_scroll(driver, max_scrolls=3)

    comment_elements = driver.find_elements(By.XPATH,
                                            "//div[contains(@class, 'comments-comment-item')]")

    for comment in comment_elements:
        # BUG FIX: the bare ``except:`` previously wrapped ``yield item``,
        # so a GeneratorExit raised at the yield (generator closed early)
        # was swallowed and turned into RuntimeError("generator ignored
        # GeneratorExit"). Build the item inside the try; yield outside it.
        try:
            author = comment.find_element(By.XPATH,
                                          ".//span[contains(@class, 'comments-post-meta__name-text')]").text.strip()
            content = comment.find_element(By.XPATH,
                                           ".//span[contains(@class, 'comments-comment-item-content')]").text.strip()
            comment_id = get_str_md5(f"{author}{content}")

            item = MediaspidersItem()
            item['es_sid'] = comment_id
            item['es_hkey'] = article_id  # links the comment to its post
            item['es_content'] = content
            item['es_authors'] = author
            item['es_userid'] = author
            item['es_urltime'] = get_current_timestamp()
            item['es_sitename'] = 'linkedin'
            item['es_srcname'] = 'linkedin_comment'
            item['es_carriertype'] = 'comment'
        except Exception:
            continue
        yield item

    # Continue with the rest of the queued comment pages.
    if self.comment_urls:
        next_comment = self.comment_urls.pop()
        yield SeleniumRequest(
            url=next_comment['url'],
            callback=self.linkedin_comment_parse,
            meta={'article_id': next_comment['article_id'], 'driver': driver}
        )
|
def form_cookie_dict(self, cookie_str: str) -> dict:
    """Parse a raw "name=value; name2=value2" cookie header into a dict.

    Strips an optional leading "Cookie:" label and surrounding double
    quotes on values (Selenium expects unquoted values).

    BUG FIX: the original prefix list was ["Cookie:", "Cookie:"] — two
    identical ASCII strings — although the comment promised support for
    both the ASCII and the fullwidth (Chinese) colon. The second entry
    now actually uses the fullwidth colon "：".

    :param cookie_str: raw cookie header text
    :return: dict mapping cookie names to values
    """
    for prefix in ("Cookie:", "Cookie："):
        if cookie_str.startswith(prefix):
            cookie_str = cookie_str[len(prefix):].strip()
            break

    cookie_dict = {}
    for pair in cookie_str.split(';'):
        pair = pair.strip()
        if not pair or '=' not in pair:
            continue
        name, value = pair.split('=', 1)  # split only on the first '='
        name, value = name.strip(), value.strip()
        # Drop surrounding double quotes — Selenium does not want them.
        if value.startswith('"') and value.endswith('"'):
            value = value[1:-1]
        cookie_dict[name] = value
    return cookie_dict
||||||
Loading…
x
Reference in New Issue
Block a user