[linkedin] 用户基本信息采集
This commit is contained in:
parent
a375b8ead1
commit
d023703622
@ -61,6 +61,7 @@ class SeleniumMiddleware:
|
||||
edge_options.use_chromium = True
|
||||
# edge_options.add_argument("--headless")
|
||||
# 隐藏“受自动化软件控制”提示栏
|
||||
edge_options.add_argument('--disable-blink-features=AutomationControlled')
|
||||
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
||||
# 禁用自动化扩展
|
||||
edge_options.add_experimental_option('useAutomationExtension', False)
|
||||
|
||||
@ -50,6 +50,7 @@ BATCH_SAVE_SIZE = 5
|
||||
|
||||
# Redis key names for the per-site URL bloom filters (deduplication by URL hash).
TWITTER_FILTER_KEY = 'URL_Filter:MediaSpiders:Twitter_Filter'
FACEBOOK_FILTER_KEY = 'URL_Filter:MediaSpiders:Facebook_Filter'
LINKEDIN_FILTER_KEY = 'URL_Filter:MediaSpiders:Linkedin_Filter'
YOUTUBE_FILTER_KEY = 'URL_Filter:MediaSpiders:Youtube_Filter'
WEIBO_FILTER_KEY = 'URL_Filter:MediaSpiders:Weibo_Filter'
WECHAT_FILTER_KEY = 'URL_Filter:MediaSpiders:Wechat_Filter'

# Redis key names for the per-site SimHash filters (near-duplicate content detection).
TWITTER_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Twitter_Filter'
FACEBOOK_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Facebook_Filter'
LINKEDIN_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Linkedin_Filter'
YOUTUBE_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Youtube_Filter'
WEIBO_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Weibo_Filter'
WECHAT_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Wechat_Filter'
|
||||
|
||||
408
spiders/MediaSpiders/MediaSpiders/spiders/LinkedinSpider.py
Normal file
408
spiders/MediaSpiders/MediaSpiders/spiders/LinkedinSpider.py
Normal file
@ -0,0 +1,408 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# 标准库
|
||||
from datetime import datetime, timedelta
|
||||
import json
|
||||
import logging as logger
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
|
||||
import redis
|
||||
import scrapy
|
||||
from redisbloom.client import Client
|
||||
from scrapy_selenium import SeleniumRequest
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
from MediaSpiders.items import MediaspidersItem
|
||||
from MediaSpiders.utils.string_utils import get_str_md5
|
||||
from MediaSpiders.utils.time_utils import get_current_timestamp
|
||||
|
||||
|
||||
class LinkedinSpider(scrapy.Spider):
    """Scrapy spider that collects LinkedIn users' recent activity (posts and,
    optionally, their comments) through a Selenium-driven browser session."""

    name = 'LinkedinUserSpider'
    # Queue of articles whose comment sections still need to be crawled.
    # NOTE(review): class-level mutable attribute — shared across all instances;
    # harmless while Scrapy runs one spider instance per process, but confirm.
    comment_urls = []
    custom_settings = {
        # Protobuf serialization settings consumed by ProtobufSavePipeline.
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        # Media download locations and the item fields the pipelines fill in.
        'IMAGES_STORE': r'/usr/local/temp_image/linkedin',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_publicinfo_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300
        },
        'DOWNLOAD_DELAY': 2,
        # Dump / keyword / simhash filtering middlewares (dedup chain).
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
            'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
        }
    }
|
||||
|
||||
def __init__(self, params=None, *args, **kwargs):
    """Initialize the spider from a JSON parameter string.

    Args:
        params: optional JSON string; currently only the 'job_id' key is read.
    """
    super(LinkedinSpider, self).__init__(*args, **kwargs)
    # Fix: json.loads(None) raises TypeError — tolerate a missing params argument.
    json_params = json.loads(params) if params else {}
    logger.info(json_params)
    self.crawl_comment = False
    # Connections are created lazily in start_requests().
    self.redis_client = None
    self.bloom_filter = None
    self.simhash_filter_key = None
    # Fix: always define job_id so later reads cannot raise AttributeError.
    self.job_id = json_params.get('job_id')
def start_requests(self):
    """Open the Redis and RedisBloom connections, then bootstrap the crawl
    with a neutral page; parse() navigates the Selenium driver to LinkedIn."""
    self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                                    password=self.settings['REDIS_PWD'])
    # Separate RedisBloom client for the bloom-filter based deduplication.
    self.bloom_filter = Client(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                               password=self.settings['REDIS_PWD'])
    self.simhash_filter_key = self.settings['LINKEDIN_SIMHASH_FILTER_KEY']
    yield SeleniumRequest(url='https://www.google.com/', callback=self.parse)
||||
|
||||
def parse(self, response):
    """Log in to LinkedIn using cookies stored in Redis, crawl each target
    user's recent-activity feed, and yield one item per post.

    Posts with comments are queued in self.comment_urls; the queue is only
    followed when self.crawl_comment is enabled.
    """
    logger.info("login linkedin")
    driver = response.request.meta['driver']
    driver.maximize_window()
    # Visit the main domain first: Selenium only accepts cookies whose domain
    # matches the page currently loaded.
    driver.get("https://www.linkedin.com/")
    time.sleep(2)

    # Fix: redis GET returns None when the key is missing — fail with a clear
    # log message instead of an AttributeError on .decode().
    raw_cookie = self.redis_client.get("MediaSpiders:Linkedin_Cookies")
    if raw_cookie is None:
        logger.error("MediaSpiders:Linkedin_Cookies not found in redis, abort crawl")
        return
    cookie_dict = self.form_cookie_dict(raw_cookie.decode())

    # Convert to the Selenium cookie format (domain and path are mandatory).
    cookies_to_add = [{
        'name': name,
        'value': value,
        'domain': '.linkedin.com',
        'path': '/',
        'secure': True
    } for name, value in cookie_dict.items()]

    for cookie in cookies_to_add:
        try:
            driver.add_cookie(cookie)
        except Exception as e:
            logger.warning(f"Failed to add cookie {cookie['name']}: {e}")

    driver.refresh()
    time.sleep(5)

    # TODO(review): target accounts are hard-coded for testing; restore the
    # SOCIAL_USER_QUERY_ALL_API account-query call for production use.
    all_user_info = [
        {'id': 87, 'userFlag': '0', 'userName': 'andrewyng', 'userType': '2', 'userUid': 'USForcesJapan.J'}]
    for user_info in all_user_info:
        user_name = user_info['userName']
        current_url = f'https://www.linkedin.com/in/{user_name}/recent-activity/all/'
        driver.get(current_url)
        time.sleep(5)

        # Incremental scrolling (detects lazy loading) instead of fixed clicks.
        self.smart_scroll(driver, max_scrolls=5)

        # Modern LinkedIn dynamic feed cards.
        current_page_articles = driver.find_elements(
            By.XPATH,
            "//div[contains(@class, 'feed-shared-update-v2')]"
        )

        logger.info(f"Found {len(current_page_articles)} articles for {user_name}")
        items = self.get_linkedin_articles(current_page_articles, user_name, user_info['userUid'])

        for item in items:
            # Queue posts that have comments for the (optional) comment pass.
            if item.get('es_commentcount', 0) > 0:
                self.comment_urls.append({
                    'url': item['es_urlname'],
                    'article_id': item['es_sid'],
                    'article_author': item['es_authors'],
                    'article_text': item['es_urlcontent']
                })
            logger.info(f"用户 {item['es_userid']} 发文: {item['es_urlcontent'][:50]}...")
            yield item

    # Comment crawling needs its own request chain (LinkedIn requires a click
    # to expand the comment section).
    if self.crawl_comment and self.comment_urls:
        comment_url = self.comment_urls.pop()
        yield SeleniumRequest(
            url=comment_url['url'],
            callback=self.linkedin_comment_parse,
            meta={'article_id': comment_url['article_id'], 'driver': driver}
        )
||||
def smart_scroll(self, driver, max_scrolls=5):
    """Scroll to the bottom of the page up to max_scrolls times, stopping
    early once the document height stops growing (no more lazy content)."""
    previous_height = driver.execute_script("return document.body.scrollHeight")
    for attempt in range(1, max_scrolls + 1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # give lazily-loaded content time to arrive
        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == previous_height:
            logger.info(f"滚动条 {attempt}:未加载新内容,停止")
            break
        logger.info(f"滚动条 {attempt}:加载了新内容到高度 {current_height}")
        previous_height = current_height
||||
|
||||
def get_linkedin_articles(self, articles, Uname, uid):
    """Convert Selenium feed-card elements into MediaspidersItem objects.

    Args:
        articles: Selenium WebElements, one per feed card
            ('feed-shared-update-v2' divs).
        Uname: fallback author name when a card has no actor title.
        uid: target account uid, stored in es_userid.

    Returns:
        list of MediaspidersItem for non-empty, not-yet-collected posts.
    """
    article_items = []
    for idx, article in enumerate(articles):
        try:
            # === 1. author name ===
            try:
                author_elem = article.find_element(
                    By.XPATH,
                    ".//span[contains(@class, 'update-components-actor__title')]//span[@aria-hidden='true']")
                uname = author_elem.text.strip()
            except Exception:
                uname = Uname

            # === 2. publish time: relative label -> epoch milliseconds ===
            try:
                time_elem = article.find_element(
                    By.XPATH,
                    ".//span[contains(@class, 'update-components-actor__sub-description')]")
                relative_time = time_elem.text.split('•')[0].strip()  # e.g. "1 个月前"
                article_time = self.parse_linkedin_relative_time(relative_time)
            except Exception as e:
                logger.warning(f"Time parse failed: {e}")
                article_time = get_current_timestamp() - 86400000  # default: 24h ago

            # === 3. body text (merge multiple paragraphs) ===
            try:
                content_parts = article.find_elements(
                    By.XPATH,
                    ".//div[contains(@class, 'update-components-text')]//span[@dir='ltr']")
                article_text = " ".join([p.text for p in content_parts if p.text.strip()])
            except Exception:
                article_text = ""

            # === 4. article URL from the card's activity URN ===
            try:
                activity_urn = article.get_attribute("data-urn")
                url_name = f"https://www.linkedin.com/feed/update/{activity_urn}"
            except Exception:
                # Fix: this fallback previously assigned `article_url`, leaving
                # `url_name` undefined and raising NameError further down.
                url_name = f"https://www.linkedin.com/in/{uname}/"

            # === 5. image URLs (lazy-load attribute) ===
            img_urls = []
            try:
                img_urls = [
                    img.get_attribute('data-delayed-url').strip()
                    for img in article.find_elements(
                        By.XPATH, ".//img[contains(@class, 'update-components-image__image')]")
                    if img.get_attribute('data-delayed-url')
                ]
            except Exception:
                pass

            # === 6. interaction counters ===
            like_count = comment_count = forward_count = 0
            try:
                like_btn = article.find_element(
                    By.XPATH,
                    ".//span[contains(@class, 'social-details-social-counts')]").text
                like_count = self.extract_number(like_btn)

                comment_btn = article.find_element(By.XPATH, ".//button[contains(@aria-label, '评论')]").text
                comment_count = self.extract_number(comment_btn)

                repost_btn = article.find_element(
                    By.XPATH,
                    ".//button[contains(@aria-label, '转发')]").text
                forward_count = self.extract_number(repost_btn)
            except Exception as e:
                logger.debug(f"Interaction count parse failed: {e}")

            # === 7. connection-degree badge (LinkedIn-specific, e.g. "• 3 度+") ===
            try:
                # NOTE(review): document-wide XPath — probably intended to be
                # relative (".//span[...]") to this card; confirm.
                degree_span = article.find_element(
                    By.XPATH,
                    "//span[@aria-hidden='true' and contains(., '•') and contains(., '度+')]"
                )
                degree_text = degree_span.text.strip()
            except Exception:
                degree_text = ""

            es_content = article_text.replace('[Original text:]', '').strip()

            # === 8. build the item ===
            article_id = get_str_md5(f"{uname}{article_text}{article_time}")
            item = MediaspidersItem()
            item['es_sid'] = article_id
            item['es_hkey'] = article_id
            item['es_content'] = es_content
            item['es_urlcontent'] = es_content
            item['es_urltime'] = article_time  # epoch ms from parse_linkedin_relative_time
            item['es_lasttime'] = get_current_timestamp()
            item['es_loadtime'] = get_current_timestamp()
            item['es_urltitle'] = uname
            item['es_authors'] = uname
            item['es_userid'] = uid
            item['image_urls'] = img_urls
            item['file_urls'] = []
            item['es_urlname'] = url_name
            item['es_commentcount'] = comment_count
            item['es_forwardcount'] = forward_count
            item['es_likecount'] = like_count
            item['es_sitename'] = 'linkedin'
            item['es_srcname'] = 'linkedin'
            item['es_carriertype'] = 'media'
            item['es_heat'] = degree_text

            # Dedup: only bloom-check posts older than 48h; bfAdd returns 0
            # when the id was already present in the filter.
            if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
                if self.bloom_filter.bfAdd(self.settings['LINKEDIN_FILTER_KEY'], article_id) <= 0:
                    logger.info(f"跳过已采集内容: {article_id[:10]}...")
                    continue

            if not item['es_urlcontent']:
                logger.warning("跳过空内容动态")
                continue

            article_items.append(item)
            logger.debug(f"Article {idx}: {uname} - {article_text[:30]}...")

        except Exception as e:
            logger.error(f"解析动态失败 (index {idx}): {repr(e)}")
            continue

    logger.info(f"用户 {uid} 共采集 {len(article_items)} 条有效动态")
    return article_items
||||
|
||||
def parse_linkedin_relative_time(self, text):
    """Convert a LinkedIn relative-time label (e.g. "1 个月前", "3 weeks ago")
    into an epoch timestamp in milliseconds.

    Unknown units or labels without digits fall back to 24 hours ago.
    """
    now = datetime.now()
    # Normalise: lower-case and drop spaces, so "3 Weeks Ago" -> "3weeksago".
    text = text.lower().replace(' ', '')

    match = re.search(r'\d+', text)
    if match is None:
        # Fix: previously raised AttributeError on digit-less labels;
        # return the same default the caller would have used.
        return get_current_timestamp() - 86400000  # default: 24h ago
    amount = int(match.group())

    # Fix: also accept English singular forms ("1 minute ago", ...) which the
    # plural-only substring checks used to miss.
    if '秒前' in text or 'secondago' in text or 'secondsago' in text:
        delta = timedelta(seconds=amount)
    elif '分钟前' in text or 'minuteago' in text or 'minutesago' in text:
        delta = timedelta(minutes=amount)
    elif '小时前' in text or 'hourago' in text or 'hoursago' in text:
        delta = timedelta(hours=amount)
    elif '天前' in text or 'dayago' in text or 'daysago' in text:
        delta = timedelta(days=amount)
    elif '周前' in text or 'weekago' in text or 'weeksago' in text:
        delta = timedelta(weeks=amount)
    elif '月前' in text or 'monthago' in text or 'monthsago' in text:
        delta = timedelta(days=amount * 30)  # approximation: 1 month ≈ 30 days
    elif '年前' in text or 'yearago' in text or 'yearsago' in text:
        delta = timedelta(days=amount * 365)  # approximation: 1 year ≈ 365 days
    else:
        return get_current_timestamp() - 86400000  # unknown unit: 24h ago
    return int((now - delta).timestamp() * 1000)
||||
|
||||
def extract_number(self, text):
    """Extract the first integer from a counter label, e.g. "1,234 个赞" -> 1234.

    Returns 0 when the text contains no usable number.
    """
    # Fix: replaced the bare except (which also swallowed KeyboardInterrupt /
    # SystemExit) with explicit handling of the two expected failure modes.
    match = re.search(r'[\d,]+', text)
    if match is None:
        return 0
    try:
        return int(match.group().replace(',', ''))
    except ValueError:
        # e.g. the character class matched a lone "," -> int('') fails
        return 0
||||
|
||||
def linkedin_comment_parse(self, response):
    """Expand and scrape the comment section of one queued article, then
    chain the next queued article (if any) via another SeleniumRequest."""
    driver = response.meta['driver']
    article_id = response.meta['article_id']

    # Click the "comment" button to expand the comment area.
    try:
        comment_btn = driver.find_element(By.XPATH,
                                          "//button[contains(@class, 'comments-comment-button')]")
        comment_btn.click()
        time.sleep(3)
    except:
        logger.warning("未找到评论按钮,跳过评论爬取")
        return

    # Scroll to lazy-load more comments.
    self.smart_scroll(driver, max_scrolls=3)

    # Extract comment cards.
    comment_elements = driver.find_elements(By.XPATH,
                                            "//div[contains(@class, 'comments-comment-item')]")

    for comment in comment_elements:
        try:
            author = comment.find_element(By.XPATH,
                                          ".//span[contains(@class, 'comments-post-meta__name-text')]").text.strip()
            content = comment.find_element(By.XPATH,
                                           ".//span[contains(@class, 'comments-comment-item-content')]").text.strip()
            # Comment id is derived from author + content (no native id available).
            comment_id = get_str_md5(f"{author}{content}")

            item = MediaspidersItem()
            item['es_sid'] = comment_id
            item['es_hkey'] = article_id  # link the comment back to its article
            item['es_content'] = content
            item['es_authors'] = author
            item['es_userid'] = author
            item['es_urltime'] = get_current_timestamp()  # crawl time; real post time is not extracted
            item['es_sitename'] = 'linkedin'
            item['es_srcname'] = 'linkedin_comment'
            item['es_carriertype'] = 'comment'
            yield item
        except:
            continue

    # Continue with the remaining articles in the comment queue.
    if self.comment_urls:
        next_comment = self.comment_urls.pop()
        yield SeleniumRequest(
            url=next_comment['url'],
            callback=self.linkedin_comment_parse,
            meta={'article_id': next_comment['article_id'], 'driver': driver}
        )
||||
|
||||
def form_cookie_dict(self, cookie_str: str) -> dict:
    """Parse a raw "Cookie:" header string into a {name: value} dict.

    Handles an optional leading "Cookie:" label (ASCII or full-width colon),
    ';'-separated pairs, values containing '=', and double-quoted values.
    """
    # Strip the label prefix. Fix: the second entry used to be a duplicate
    # ASCII "Cookie:"; per the original comment the intent was to also accept
    # the full-width (Chinese) colon U+FF1A.
    for prefix in ("Cookie:", "Cookie:"):
        if cookie_str.startswith(prefix):
            cookie_str = cookie_str[len(prefix):].strip()
            break

    cookie_dict = {}
    for chunk in cookie_str.split(';'):
        chunk = chunk.strip()
        if not chunk or '=' not in chunk:
            continue  # skip empty segments and flag-style entries
        name, value = chunk.split('=', 1)  # split on the first '=' only
        name, value = name.strip(), value.strip()
        # Strip surrounding double quotes (Selenium expects the bare value).
        if value.startswith('"') and value.endswith('"'):
            value = value[1:-1]
        cookie_dict[name] = value
    return cookie_dict
|
||||
Loading…
x
Reference in New Issue
Block a user