[linkedin] 用户基本信息采集

This commit is contained in:
DELL 2026-01-28 11:00:03 +08:00
parent a375b8ead1
commit d023703622
3 changed files with 411 additions and 0 deletions

View File

@ -61,6 +61,7 @@ class SeleniumMiddleware:
edge_options.use_chromium = True
# edge_options.add_argument("--headless")
# 隐藏“受自动化软件控制”提示栏
edge_options.add_argument('--disable-blink-features=AutomationControlled')
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
# 禁用自动化扩展
edge_options.add_experimental_option('useAutomationExtension', False)

View File

@ -50,6 +50,7 @@ BATCH_SAVE_SIZE = 5
TWITTER_FILTER_KEY = 'URL_Filter:MediaSpiders:Twitter_Filter'
FACEBOOK_FILTER_KEY = 'URL_Filter:MediaSpiders:Facebook_Filter'
LINKEDIN_FILTER_KEY = 'URL_Filter:MediaSpiders:Linkedin_Filter'
YOUTUBE_FILTER_KEY = 'URL_Filter:MediaSpiders:Youtube_Filter'
WEIBO_FILTER_KEY = 'URL_Filter:MediaSpiders:Weibo_Filter'
WECHAT_FILTER_KEY = 'URL_Filter:MediaSpiders:Wechat_Filter'
@ -57,6 +58,7 @@ FLICKR_FILTER_KEY = 'URL_Filter:MediaSpiders:Flickr_Filter'
TWITTER_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Twitter_Filter'
FACEBOOK_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Facebook_Filter'
LINKEDIN_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Linkedin_Filter'
YOUTUBE_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Youtube_Filter'
WEIBO_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Weibo_Filter'
WECHAT_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Wechat_Filter'

View File

@ -0,0 +1,408 @@
# -*- coding: utf-8 -*-
# 标准库
from datetime import datetime, timedelta
import json
import logging as logger
import re
import time
import hashlib
import redis
import scrapy
from redisbloom.client import Client
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.string_utils import get_str_md5
from MediaSpiders.utils.time_utils import get_current_timestamp
class LinkedinSpider(scrapy.Spider):
    """Collect LinkedIn user activity (posts, interaction counts, optionally
    comments) through a Selenium-driven browser session.

    Workflow:
      1. ``start_requests`` issues one throwaway SeleniumRequest so that
         scrapy-selenium hands us a live webdriver in ``parse``.
      2. ``parse`` injects session cookies stored in Redis under
         ``MediaSpiders:Linkedin_Cookies``, opens each target user's
         recent-activity feed, scrolls to trigger lazy loading, and yields one
         ``MediaspidersItem`` per post.
      3. When ``crawl_comment`` is enabled, queued post URLs are revisited by
         ``linkedin_comment_parse`` to harvest comments.

    Deduplication uses a RedisBloom filter (``LINKEDIN_FILTER_KEY``) for posts
    older than 48 hours.
    """
    name = 'LinkedinUserSpider'
    # Class-level default only; __init__ installs a fresh per-instance list so
    # two spider instances never share the same pending-comment queue
    # (mutable class attributes are shared across all instances).
    comment_urls = []
    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/linkedin',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_publicinfo_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300
        },
        'DOWNLOAD_DELAY': 2,
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
            'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
        }
    }

    def __init__(self, params=None, *args, **kwargs):
        """Create the spider.

        :param params: optional JSON string; currently only ``job_id`` is read.
        """
        super(LinkedinSpider, self).__init__(*args, **kwargs)
        # Robustness: the declared default is None, and json.loads(None)
        # raises TypeError — treat a missing/empty params as "{}".
        json_params = json.loads(params) if params else {}
        logger.info(json_params)
        self.crawl_comment = False
        self.redis_client = None
        self.bloom_filter = None
        self.simhash_filter_key = None
        # Shadow the mutable class attribute with a per-instance queue.
        self.comment_urls = []
        if 'job_id' in json_params:
            self.job_id = json_params['job_id']

    def start_requests(self):
        """Open Redis / RedisBloom connections, then bootstrap the webdriver.

        The google.com request is only a vehicle to obtain a Selenium driver;
        all real navigation happens inside ``parse``.
        """
        self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                                        password=self.settings['REDIS_PWD'])
        self.bloom_filter = Client(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                                   password=self.settings['REDIS_PWD'])
        self.simhash_filter_key = self.settings['LINKEDIN_SIMHASH_FILTER_KEY']
        yield SeleniumRequest(url='https://www.google.com/', callback=self.parse)

    def parse(self, response):
        """Log in with stored cookies and scrape each target user's feed."""
        logger.info("login linkedin")
        driver = response.request.meta['driver']
        driver.maximize_window()
        # Cookies can only be attached for the current domain, so visit the
        # main site first, then inject the session cookies.
        driver.get("https://www.linkedin.com/")
        time.sleep(2)
        cookie_string = self.redis_client.get("MediaSpiders:Linkedin_Cookies").decode()
        cookie_dict = self.form_cookie_dict(cookie_string)
        # Selenium requires explicit domain/path on each cookie dict.
        cookies_to_add = []
        for name, value in cookie_dict.items():
            cookies_to_add.append({
                'name': name,
                'value': value,
                'domain': '.linkedin.com',
                'path': '/',
                'secure': True
            })
        for cookie in cookies_to_add:
            try:
                driver.add_cookie(cookie)
            except Exception as e:
                logger.warning(f"Failed to add cookie {cookie['name']}: {e}")
        driver.refresh()
        time.sleep(5)
        # TODO: fetch the target-account list from the user-query API instead
        # of the hard-coded test account below.
        # account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
        # account_query_api = account_query_api.format(sortBy="id", shuffleResult="true")
        # post_data = {
        #     'userType': self.settings['FACEBOOK_USER_TYPE'],
        #     'userFlag': 0
        # }
        #
        # account_rsp = json.loads(
        #     http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
        # all_user_info = []
        # if account_rsp['code'] == 200:
        #     all_user_info = account_rsp['content']
        #     logger.info('GET %s users' % account_rsp['message'])
        # driver.set_window_size(1920, 1080)
        all_user_info = [
            {'id': 87, 'userFlag': '0', 'userName': 'andrewyng', 'userType': '2', 'userUid': 'USForcesJapan.J'}]
        for user_info in all_user_info:
            user_name = user_info['userName']
            current_url = f'https://www.linkedin.com/in/{user_name}/recent-activity/all/'
            driver.get(current_url)
            time.sleep(5)
            # Incremental scrolling replaces fixed-coordinate clicking so that
            # lazily loaded feed cards appear in the DOM.
            self.smart_scroll(driver, max_scrolls=5)
            # Each post is rendered as a 'feed-shared-update-v2' card.
            current_page_articles = driver.find_elements(
                By.XPATH,
                "//div[contains(@class, 'feed-shared-update-v2')]"
            )
            logger.info(f"Found {len(current_page_articles)} articles for {user_name}")
            items = self.get_linkedin_articles(current_page_articles, user_name, user_info['userUid'])
            for item in items:
                if item.get('es_commentcount', 0) > 0:
                    self.comment_urls.append({
                        'url': item['es_urlname'],
                        'article_id': item['es_sid'],
                        'article_author': item['es_authors'],
                        'article_text': item['es_urlcontent']
                    })
                logger.info(f"用户 {item['es_userid']} 发文: {item['es_urlcontent'][:50]}...")
                yield item
        # Comment crawling is opt-in: LinkedIn comments need a click-to-expand
        # flow, handled one URL at a time in linkedin_comment_parse.
        if self.crawl_comment and self.comment_urls:
            comment_url = self.comment_urls.pop()
            yield SeleniumRequest(
                url=comment_url['url'],
                callback=self.linkedin_comment_parse,
                meta={'article_id': comment_url['article_id'], 'driver': driver}
            )

    def smart_scroll(self, driver, max_scrolls=5):
        """Scroll to the page bottom up to ``max_scrolls`` times, stopping
        early once the document height no longer grows (no new content)."""
        last_height = driver.execute_script("return document.body.scrollHeight")
        for i in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # allow lazy-loaded content to render
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                logger.info(f"滚动条 {i + 1}:未加载新内容,停止")
                break
            last_height = new_height
            logger.info(f"滚动条 {i + 1}:加载了新内容到高度 {new_height}")

    def get_linkedin_articles(self, articles, Uname, uid):
        """Convert feed-card WebElements into MediaspidersItem objects.

        :param articles: WebElements located by ``parse`` (one per post).
        :param Uname: fallback author name when the card carries none.
        :param uid: target account UID, written to ``es_userid``.
        :return: list of deduplicated, non-empty items.
        """
        article_items = []
        for idx, article in enumerate(articles):
            try:
                # === 1. Author name (fallback to the account name) ===
                try:
                    author_elem = article.find_element(By.XPATH,
                                                       ".//span[contains(@class, 'update-components-actor__title')]//span[@aria-hidden='true']")
                    uname = author_elem.text.strip()
                except Exception:
                    uname = Uname
                # === 2. Publish time (relative → absolute ms timestamp) ===
                try:
                    time_elem = article.find_element(By.XPATH,
                                                     ".//span[contains(@class, 'update-components-actor__sub-description')]")
                    # Bug fix: split('') always raised ValueError (empty
                    # separator), forcing every post onto the fallback time.
                    # The sub-description looks like "1 个月前 • ..." — split on
                    # the bullet; TODO confirm separator against live markup.
                    relative_time = time_elem.text.split('•')[0].strip()
                    article_time = self.parse_linkedin_relative_time(relative_time)
                except Exception as e:
                    logger.warning(f"Time parse failed: {e}")
                    article_time = get_current_timestamp() - 86400000  # default: 24 h ago
                # === 3. Body text (merge multiple paragraphs) ===
                try:
                    content_parts = article.find_elements(By.XPATH,
                                                          ".//div[contains(@class, 'update-components-text')]//span[@dir='ltr']")
                    article_text = " ".join([p.text for p in content_parts if p.text.strip()])
                except Exception:
                    article_text = ""
                # === 4. Post URL (from the activity URN) ===
                try:
                    activity_urn = article.get_attribute("data-urn")
                    url_name = f"https://www.linkedin.com/feed/update/{activity_urn}"
                except Exception:
                    # Bug fix: the original bound 'article_url' here, leaving
                    # 'url_name' undefined and raising NameError below. Fall
                    # back to the author's profile page instead.
                    url_name = f"https://www.linkedin.com/in/{uname}/"
                # === 5. Images ===
                img_urls = []
                try:
                    img_urls = [
                        img.get_attribute('data-delayed-url').strip()
                        for img in
                        article.find_elements(By.XPATH, ".//img[contains(@class, 'update-components-image__image')]")
                        if img.get_attribute('data-delayed-url')
                    ]
                except Exception:
                    pass
                # === 6. Interaction counts (best effort; default 0) ===
                like_count = comment_count = forward_count = 0
                try:
                    like_btn = article.find_element(By.XPATH,
                                                    ".//span[contains(@class, 'social-details-social-counts')]").text
                    like_count = self.extract_number(like_btn)
                    comment_btn = article.find_element(By.XPATH, ".//button[contains(@aria-label, '评论')]").text
                    comment_count = self.extract_number(comment_btn)
                    repost_btn = article.find_element(By.XPATH,
                                                      ".//button[contains(@aria-label, '转发')]").text
                    forward_count = self.extract_number(repost_btn)
                except Exception as e:
                    logger.debug(f"Interaction count parse failed: {e}")
                # === 7. Connection-degree badge (LinkedIn-specific) ===
                try:
                    # Bug fix: the XPath was absolute ('//span...'), matching
                    # against the whole document instead of this card, and its
                    # separator was garbled to ''. Locate the span containing
                    # "• 3 度+"-style text relative to the card.
                    degree_span = article.find_element(
                        By.XPATH,
                        ".//span[@aria-hidden='true' and contains(., '•') and contains(., '度+')]"
                    )
                    degree_text = degree_span.text.strip()
                except Exception:
                    degree_text = ""
                es_content = article_text.replace('[Original text:]', '').strip()
                # === 8. Build the item ===
                article_id = get_str_md5(f"{uname}{article_text}{article_time}")
                item = MediaspidersItem()
                item['es_sid'] = article_id
                item['es_hkey'] = article_id
                item['es_content'] = es_content
                item['es_urlcontent'] = es_content
                item['es_urltime'] = article_time  # ms epoch from parse_linkedin_relative_time
                item['es_lasttime'] = get_current_timestamp()
                item['es_loadtime'] = get_current_timestamp()
                item['es_urltitle'] = uname
                item['es_authors'] = uname
                item['es_userid'] = uid
                item['image_urls'] = img_urls
                item['file_urls'] = []
                item['es_urlname'] = url_name
                item['es_commentcount'] = comment_count
                item['es_forwardcount'] = forward_count
                item['es_likecount'] = like_count
                item['es_sitename'] = 'linkedin'
                item['es_srcname'] = 'linkedin'
                item['es_carriertype'] = 'media'
                item['es_heat'] = degree_text
                # Dedup: only posts older than 48 h go through the bloom
                # filter; bfAdd returns 0 when the id was already present.
                if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
                    if self.bloom_filter.bfAdd(self.settings['LINKEDIN_FILTER_KEY'], article_id) <= 0:
                        logger.info(f"跳过已采集内容: {article_id[:10]}...")
                        continue
                if not item['es_urlcontent']:
                    logger.warning("跳过空内容动态")
                    continue
                article_items.append(item)
                logger.debug(f"Article {idx}: {uname} - {article_text[:30]}...")
            except Exception as e:
                logger.error(f"解析动态失败 (index {idx}): {repr(e)}")
                continue
        logger.info(f"用户 {uid} 共采集 {len(article_items)} 条有效动态")
        return article_items

    def parse_linkedin_relative_time(self, text):
        """Convert a relative time ("1 个月前", "3 weeks ago") to a
        millisecond epoch timestamp; unrecognized text falls back to 24 h ago.
        """
        now = datetime.now()
        text = text.lower().replace(' ', '')
        match = re.search(r'\d+', text)
        if not match:
            return get_current_timestamp() - 86400000  # default: 24 h ago
        value = int(match.group())
        # (Chinese marker, English stem, one unit). English stems are singular
        # so both "1monthago" and "3monthsago" match — the original only
        # matched plural forms. Month/year are approximated as 30/365 days.
        units = [
            ('秒前', 'second', timedelta(seconds=1)),
            ('分钟前', 'minute', timedelta(minutes=1)),
            ('小时前', 'hour', timedelta(hours=1)),
            ('天前', 'day', timedelta(days=1)),
            ('周前', 'week', timedelta(weeks=1)),
            ('月前', 'month', timedelta(days=30)),
            ('年前', 'year', timedelta(days=365)),
        ]
        for cn_marker, en_marker, unit in units:
            if cn_marker in text or en_marker in text:
                return int((now - unit * value).timestamp() * 1000)
        return get_current_timestamp() - 86400000  # default: 24 h ago

    def extract_number(self, text):
        """Extract the first integer from text like "1,234 个赞" → 1234;
        return 0 when no number is present."""
        try:
            num_str = re.search(r'[\d,]+', text).group().replace(',', '')
            return int(num_str)
        except (AttributeError, ValueError):
            return 0

    def linkedin_comment_parse(self, response):
        """Expand and scrape the comment section of one queued post, then
        chain to the next queued post URL (if any)."""
        driver = response.meta['driver']
        article_id = response.meta['article_id']
        # Comments are hidden behind a click-to-expand button.
        try:
            comment_btn = driver.find_element(By.XPATH,
                                              "//button[contains(@class, 'comments-comment-button')]")
            comment_btn.click()
            time.sleep(3)
        except Exception:
            logger.warning("未找到评论按钮,跳过评论爬取")
            return
        # Scroll to load more comments.
        self.smart_scroll(driver, max_scrolls=3)
        comment_elements = driver.find_elements(By.XPATH,
                                                "//div[contains(@class, 'comments-comment-item')]")
        for comment in comment_elements:
            try:
                author = comment.find_element(By.XPATH,
                                              ".//span[contains(@class, 'comments-post-meta__name-text')]").text.strip()
                content = comment.find_element(By.XPATH,
                                               ".//span[contains(@class, 'comments-comment-item-content')]").text.strip()
                comment_id = get_str_md5(f"{author}{content}")
                item = MediaspidersItem()
                item['es_sid'] = comment_id
                item['es_hkey'] = article_id  # link comment back to its post
                item['es_content'] = content
                item['es_authors'] = author
                item['es_userid'] = author
                item['es_urltime'] = get_current_timestamp()
                item['es_sitename'] = 'linkedin'
                item['es_srcname'] = 'linkedin_comment'
                item['es_carriertype'] = 'comment'
                yield item
            except Exception:
                continue
        # Chain to the next queued post.
        if self.comment_urls:
            next_comment = self.comment_urls.pop()
            yield SeleniumRequest(
                url=next_comment['url'],
                callback=self.linkedin_comment_parse,
                meta={'article_id': next_comment['article_id'], 'driver': driver}
            )

    def form_cookie_dict(self, cookie_str: str) -> dict:
        """Parse a raw "name=value; name2=value2" cookie header string into a
        dict, tolerating an optional "Cookie:" / "Cookie" prefix and
        stripping surrounding double quotes from values."""
        for prefix in ["Cookie:", "Cookie"]:
            if cookie_str.startswith(prefix):
                cookie_str = cookie_str[len(prefix):].strip()
                break
        cookie_dict = {}
        for item in cookie_str.split(';'):
            item = item.strip()
            if not item or '=' not in item:
                continue
            name, value = item.split('=', 1)  # split on the first '=' only
            name, value = name.strip(), value.strip()
            # Selenium does not want quoted cookie values.
            if value.startswith('"') and value.endswith('"'):
                value = value[1:-1]
            cookie_dict[name] = value
        return cookie_dict