diff --git a/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py b/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py
index 0cbcf00..a6db10b 100644
--- a/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py
+++ b/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py
@@ -61,6 +61,7 @@ class SeleniumMiddleware:
         edge_options.use_chromium = True
         # edge_options.add_argument("--headless")
         # Hide the "controlled by automated software" notification bar
+        edge_options.add_argument('--disable-blink-features=AutomationControlled')
         edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
         # Disable the automation extension
         edge_options.add_experimental_option('useAutomationExtension', False)
diff --git a/spiders/MediaSpiders/MediaSpiders/settings.py b/spiders/MediaSpiders/MediaSpiders/settings.py
index e00a062..fa726d5 100644
--- a/spiders/MediaSpiders/MediaSpiders/settings.py
+++ b/spiders/MediaSpiders/MediaSpiders/settings.py
@@ -50,6 +50,7 @@ BATCH_SAVE_SIZE = 5
 
 TWITTER_FILTER_KEY = 'URL_Filter:MediaSpiders:Twitter_Filter'
 FACEBOOK_FILTER_KEY = 'URL_Filter:MediaSpiders:Facebook_Filter'
+LINKEDIN_FILTER_KEY = 'URL_Filter:MediaSpiders:Linkedin_Filter'
 YOUTUBE_FILTER_KEY = 'URL_Filter:MediaSpiders:Youtube_Filter'
 WEIBO_FILTER_KEY = 'URL_Filter:MediaSpiders:Weibo_Filter'
 WECHAT_FILTER_KEY = 'URL_Filter:MediaSpiders:Wechat_Filter'
@@ -57,6 +58,7 @@ FLICKR_FILTER_KEY = 'URL_Filter:MediaSpiders:Flickr_Filter'
 
 TWITTER_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Twitter_Filter'
 FACEBOOK_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Facebook_Filter'
+LINKEDIN_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Linkedin_Filter'
 YOUTUBE_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Youtube_Filter'
 WEIBO_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Weibo_Filter'
 WECHAT_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Wechat_Filter'
diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/LinkedinSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/LinkedinSpider.py
new file mode 100644
index 0000000..45cae91
--- /dev/null
+++ b/spiders/MediaSpiders/MediaSpiders/spiders/LinkedinSpider.py
@@ -0,0 +1,408 @@
+# -*- coding: utf-8 -*-
+# Standard library
+from datetime import datetime, timedelta
+import json
+import logging as logger
+import re
+import time
+
+# Third-party
+import redis
+import scrapy
+from redisbloom.client import Client
+from scrapy_selenium import SeleniumRequest
+from selenium.webdriver.common.by import By
+
+# Project
+from MediaSpiders.items import MediaspidersItem
+from MediaSpiders.utils.string_utils import get_str_md5
+from MediaSpiders.utils.time_utils import get_current_timestamp
+
+
+class LinkedinSpider(scrapy.Spider):
+    name = 'LinkedinUserSpider'
+    custom_settings = {
+        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
+        'PROTO_CLASS_NAME': 'EsSets',
+        'PROTO_FIELD_NAME': 'Es',
+        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
+        'IMAGES_STORE': r'/usr/local/temp_image/linkedin',
+        'IMAGES_RESULT_FIELD': 'es_urlimage',
+        'FILES_STORE': r'/usr/local/videos',
+        'FILES_RESULT_FIELD': 'es_video',
+        'ZIP_FILE_NAME': 'image_data_publicinfo_',
+        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
+        'ITEM_PIPELINES': {
+            'scrapy.pipelines.images.ImagesPipeline': 2,
+            'scrapy.pipelines.files.FilesPipeline': 1,
+            'MediaSpiders.pipelines.ProtobufSavePipeline': 300
+        },
+        'DOWNLOAD_DELAY': 2,
+        'SPIDER_MIDDLEWARES': {
+            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
+            'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
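+            # Near-duplicate filtering on post text; assumed to consume
+            # self.simhash_filter_key (set in start_requests from LINKEDIN_SIMHASH_FILTER_KEY).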
+            'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
+        }
+    }
+
+    def __init__(self, params=None, *args, **kwargs):
+        super(LinkedinSpider, self).__init__(*args, **kwargs)
+        json_params = json.loads(params) if params else {}
+        logger.info(json_params)
+        self.crawl_comment = False
+        self.comment_urls = []
+        self.redis_client = None
+        self.bloom_filter = None
+        self.simhash_filter_key = None
+        if 'job_id' in json_params:
+            self.job_id = json_params['job_id']
+
+    def start_requests(self):
+        self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
+                                        password=self.settings['REDIS_PWD'])
+        self.bloom_filter = Client(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
+                                   password=self.settings['REDIS_PWD'])
+        self.simhash_filter_key = self.settings['LINKEDIN_SIMHASH_FILTER_KEY']
+        yield SeleniumRequest(url='https://www.google.com/', callback=self.parse)
+
+    def parse(self, response):
+        logger.info("login linkedin")
+        driver = response.request.meta['driver']
+        driver.maximize_window()
+        # Visit the main domain first, then set cookies
+        driver.get("https://www.linkedin.com/")
+        time.sleep(2)
+
+        # Add cookies (the domain must be .linkedin.com)
+        cookie_string = self.redis_client.get("MediaSpiders:Linkedin_Cookies").decode()
+        cookie_dict = self.form_cookie_dict(cookie_string)
+
+        # Convert to the format Selenium expects (domain and path are required)
+        cookies_to_add = []
+        for name, value in cookie_dict.items():
+            cookies_to_add.append({
+                'name': name,
+                'value': value,
+                'domain': '.linkedin.com',
+                'path': '/',
+                'secure': True
+            })
+
+        for cookie in cookies_to_add:
+            try:
+                driver.add_cookie(cookie)
+            except Exception as e:
+                logger.warning(f"Failed to add cookie {cookie['name']}: {e}")
+
+        driver.refresh()
+        time.sleep(5)
+
+        # Fetch the target accounts to crawl and request them one by one
+        # account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
+        # account_query_api = account_query_api.format(sortBy="id", shuffleResult="true")
+        # post_data = {
+        #     'userType': self.settings['FACEBOOK_USER_TYPE'],
+        #     'userFlag': 0
+        # }
+        #
+        # account_rsp = json.loads(
+        #     http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
+        # all_user_info = []
+
+        # if account_rsp['code'] == 200:
+        #     all_user_info = account_rsp['content']
+        #     logger.info('GET %s users' % account_rsp['message'])
+        # driver.set_window_size(1920, 1080)
+
+        # Hardcoded test account; switch back to the account-query API above in production
+        all_user_info = [
+            {'id': 87, 'userFlag': '0', 'userName': 'andrewyng', 'userType': '2', 'userUid': 'USForcesJapan.J'}]
+        for user_info in all_user_info:
+            user_name = user_info['userName']
+            # Build the recent-activity URL (no trailing whitespace)
+            current_url = f'https://www.linkedin.com/in/{user_name}/recent-activity/all/'
+            driver.get(current_url)
+            time.sleep(5)
+
+            # Incremental scrolling to trigger lazy loading (replaces fixed-coordinate clicks)
+            self.smart_scroll(driver, max_scrolls=5)
+
+            # Locate feed cards via the modern LinkedIn update container class
+            current_page_articles = driver.find_elements(
+                By.XPATH,
+                "//div[contains(@class, 'feed-shared-update-v2')]"
+            )
+
+            logger.info(f"Found {len(current_page_articles)} articles for {user_name}")
+            items = self.get_linkedin_articles(current_page_articles, user_name, user_info['userUid'])
+
+            for item in items:
+                if item.get('es_commentcount', 0) > 0:
+                    self.comment_urls.append({
+                        'url': item['es_urlname'],
+                        'article_id': item['es_sid'],
+                        'article_author': item['es_authors'],
+                        'article_text': item['es_urlcontent']
+                    })
+                logger.info(f"User {item['es_userid']} posted: {item['es_urlcontent'][:50]}...")
+                yield item
+
+        # Comment crawling is chained separately (LinkedIn comments must be expanded by clicking)
+        if self.crawl_comment and self.comment_urls:
+            comment_url = self.comment_urls.pop()
+            yield SeleniumRequest(
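+                # One comment page is requested at a time; linkedin_comment_parse pops the
+                # next queued URL when it finishes, and meta ties items to the parent post.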
+                url=comment_url['url'],
+                callback=self.linkedin_comment_parse,
+                meta={'article_id': comment_url['article_id'], 'driver': driver}
+            )
+
+    def smart_scroll(self, driver, max_scrolls=5):
+        """Scroll down repeatedly and stop once no new content loads."""
+        last_height = driver.execute_script("return document.body.scrollHeight")
+        for i in range(max_scrolls):
+            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            time.sleep(3)  # Wait for lazy-loaded content
+
+            # Check whether new content was loaded
+            new_height = driver.execute_script("return document.body.scrollHeight")
+            if new_height == last_height:
+                logger.info(f"Scroll {i + 1}: no new content, stopping")
+                break
+            last_height = new_height
+            logger.info(f"Scroll {i + 1}: new content loaded, page height now {new_height}")
+
+    def get_linkedin_articles(self, articles, Uname, uid):
+        article_items = []
+        for idx, article in enumerate(articles):
+            try:
+                # === 1. Author name ===
+                try:
+                    author_elem = article.find_element(By.XPATH,
+                        ".//span[contains(@class, 'update-components-actor__title')]//span[@aria-hidden='true']")
+                    uname = author_elem.text.strip()
+                except Exception:
+                    uname = Uname
+
+                # === 2. Publish time (relative label converted to an absolute timestamp) ===
+                try:
+                    time_elem = article.find_element(By.XPATH,
+                        ".//span[contains(@class, 'update-components-actor__sub-description')]")
+                    relative_time = time_elem.text.split('•')[0].strip()  # e.g. "1 个月前" ("1 month ago")
+                    article_time = self.parse_linkedin_relative_time(relative_time)
+                except Exception as e:
+                    logger.warning(f"Time parse failed: {e}")
+                    article_time = get_current_timestamp() - 86400000  # Default: 24 hours ago
+
+                # === 3. Body text (merge multiple paragraphs) ===
+                try:
+                    content_parts = article.find_elements(By.XPATH,
+                        ".//div[contains(@class, 'update-components-text')]//span[@dir='ltr']")
+                    article_text = " ".join([p.text for p in content_parts if p.text.strip()])
+                except Exception:
+                    article_text = ""
+
+                # === 4. Post URL (built from the update's data-urn) ===
+                try:
+                    activity_urn = article.get_attribute("data-urn")
+                    url_name = f"https://www.linkedin.com/feed/update/{activity_urn}"
+                except Exception:
+                    url_name = f"https://www.linkedin.com/in/{uname}/"
+
+                # === 5. Images ===
+                img_urls = []
+                try:
+                    img_urls = [
+                        img.get_attribute('data-delayed-url').strip()
+                        for img in
+                        article.find_elements(By.XPATH, ".//img[contains(@class, 'update-components-image__image')]")
+                        if img.get_attribute('data-delayed-url')
+                    ]
+                except Exception:
+                    pass
+
+                # === 6. Engagement counts (parsed from button/label text) ===
+                # NOTE: the '评论' ("comment") and '转发' ("repost") aria-label matches
+                # assume a Chinese-locale LinkedIn UI
+                like_count = comment_count = forward_count = 0
+                try:
+                    # Likes
+                    like_btn = article.find_element(By.XPATH,
+                        ".//span[contains(@class, 'social-details-social-counts')]").text
+                    like_count = self.extract_number(like_btn)
+
+                    # Comments
+                    comment_btn = article.find_element(By.XPATH, ".//button[contains(@aria-label, '评论')]").text
+                    comment_count = self.extract_number(comment_btn)
+
+                    # Reposts
+                    repost_btn = article.find_element(By.XPATH,
+                        ".//button[contains(@aria-label, '转发')]").text
+                    forward_count = self.extract_number(repost_btn)
+                except Exception as e:
+                    logger.debug(f"Interaction count parse failed: {e}")
+
+                # === 7. Connection-degree badge (LinkedIn-specific marker): locate the
+                # span containing e.g. "• 3 度+" ("3rd+") ===
+                try:
+                    degree_span = article.find_element(
+                        By.XPATH,
+                        ".//span[@aria-hidden='true' and contains(., '•') and contains(., '度+')]"
+                    )
+                    degree_text = degree_span.text.strip()
+                except Exception:
+                    degree_text = ""
+
+                es_content = article_text.replace('[Original text:]', '').strip()
+
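+                # The post id below is an MD5 over author + text + parsed time; because
+                # article_time comes from a relative label, the same post may hash
+                # differently across runs.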
+                # === 8. Build the item ===
+                article_id = get_str_md5(f"{uname}{article_text}{article_time}")
+                item = MediaspidersItem()
+                item['es_sid'] = article_id
+                item['es_hkey'] = article_id
+                item['es_content'] = es_content
+                item['es_urlcontent'] = es_content
+                item['es_urltime'] = article_time  # Millisecond timestamp (see parse_linkedin_relative_time)
+                item['es_lasttime'] = get_current_timestamp()
+                item['es_loadtime'] = get_current_timestamp()
+                item['es_urltitle'] = uname
+                item['es_authors'] = uname
+                item['es_userid'] = uid
+                item['image_urls'] = img_urls
+                item['file_urls'] = []
+                item['es_urlname'] = url_name
+                item['es_commentcount'] = comment_count
+                item['es_forwardcount'] = forward_count
+                item['es_likecount'] = like_count
+                item['es_sitename'] = 'linkedin'
+                item['es_srcname'] = 'linkedin'
+                item['es_carriertype'] = 'media'
+                item['es_heat'] = degree_text
+
+                # Deduplication: only bloom-check posts older than 48 hours;
+                # bfAdd returns 0 when the id is already in the filter
+                if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
+                    if self.bloom_filter.bfAdd(self.settings['LINKEDIN_FILTER_KEY'], article_id) <= 0:
+                        logger.info(f"Skipping already-collected post: {article_id[:10]}...")
+                        continue
+
+                if not item['es_urlcontent']:
+                    logger.warning("Skipping post with empty content")
+                    continue
+
+                article_items.append(item)
+                logger.debug(f"Article {idx}: {uname} - {article_text[:30]}...")
+
+            except Exception as e:
+                logger.error(f"Failed to parse post (index {idx}): {repr(e)}")
+                continue
+
+        logger.info(f"Collected {len(article_items)} valid posts for user {uid}")
+        return article_items
+
+    def parse_linkedin_relative_time(self, text):
+        """Convert relative times such as "1 个月前" ("1 month ago") to millisecond timestamps."""
+        now = datetime.now()
+        text = text.lower().replace(' ', '')
+
+        if '秒前' in text or 'secondsago' in text:
+            seconds = int(re.search(r'\d+', text).group())
+            return int((now - timedelta(seconds=seconds)).timestamp() * 1000)
+        elif '分钟前' in text or 'minutesago' in text:
+            minutes = int(re.search(r'\d+', text).group())
+            return int((now - timedelta(minutes=minutes)).timestamp() * 1000)
+        elif '小时前' in text or 'hoursago' in text:
+            hours = int(re.search(r'\d+', text).group())
+            return int((now - timedelta(hours=hours)).timestamp() * 1000)
+        elif '天前' in text or 'daysago' in text:
+            days = int(re.search(r'\d+', text).group())
+            return int((now - timedelta(days=days)).timestamp() * 1000)
+        elif '周前' in text or 'weeksago' in text:
+            weeks = int(re.search(r'\d+', text).group())
+            return int((now - timedelta(weeks=weeks)).timestamp() * 1000)
+        elif '月前' in text or 'monthsago' in text:
+            months = int(re.search(r'\d+', text).group())
+            # Approximation: 1 month ≈ 30 days
+            return int((now - timedelta(days=months * 30)).timestamp() * 1000)
+        elif '年前' in text or 'yearsago' in text:
+            years = int(re.search(r'\d+', text).group())
+            return int((now - timedelta(days=years * 365)).timestamp() * 1000)
+        else:
+            return get_current_timestamp() - 86400000  # Default: 24 hours ago
+
+    def extract_number(self, text):
+        """Extract 1234 from text like "1,234 个赞" ("1,234 likes")."""
+        try:
+            num_str = re.search(r'[\d,]+', text).group().replace(',', '')
+            return int(num_str)
+        except Exception:
+            return 0
+
+    def linkedin_comment_parse(self, response):
+        driver = response.meta['driver']
+        article_id = response.meta['article_id']
+
+        # Click the comment button to expand the comment section
+        try:
+            comment_btn = driver.find_element(By.XPATH,
+                "//button[contains(@class, 'comments-comment-button')]")
+            comment_btn.click()
+            time.sleep(3)
+        except Exception:
+            logger.warning("Comment button not found, skipping comment crawl")
+            return
+
+        # Scroll to load more comments
+        self.smart_scroll(driver, max_scrolls=3)
+
+        # Extract comments
+        comment_elements = driver.find_elements(By.XPATH,
+            "//div[contains(@class, 'comments-comment-item')]")
+
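+        # Each comment becomes its own item, linked to the parent post via es_hkey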
+        for comment in comment_elements:
+            try:
+                author = comment.find_element(By.XPATH,
+                    ".//span[contains(@class, 'comments-post-meta__name-text')]").text.strip()
+                content = comment.find_element(By.XPATH,
+                    ".//span[contains(@class, 'comments-comment-item-content')]").text.strip()
+                comment_id = get_str_md5(f"{author}{content}")
+
+                item = MediaspidersItem()
+                item['es_sid'] = comment_id
+                item['es_hkey'] = article_id
+                item['es_content'] = content
+                item['es_authors'] = author
+                item['es_userid'] = author
+                item['es_urltime'] = get_current_timestamp()
+                item['es_sitename'] = 'linkedin'
+                item['es_srcname'] = 'linkedin_comment'
+                item['es_carriertype'] = 'comment'
+                yield item
+            except Exception:
+                continue
+
+        # Continue with the remaining queued comment pages
+        if self.comment_urls:
+            next_comment = self.comment_urls.pop()
+            yield SeleniumRequest(
+                url=next_comment['url'],
+                callback=self.linkedin_comment_parse,
+                meta={'article_id': next_comment['article_id'], 'driver': driver}
+            )
+
+    def form_cookie_dict(self, cookie_str: str) -> dict:
+        # Strip the prefix (handles both the ASCII and the full-width colon)
+        for prefix in ["Cookie:", "Cookie:"]:
+            if cookie_str.startswith(prefix):
+                cookie_str = cookie_str[len(prefix):].strip()
+                break
+
+        cookie_dict = {}
+        for item in cookie_str.split(';'):
+            item = item.strip()
+            if not item or '=' not in item:
+                continue
+            name, value = item.split('=', 1)  # Split only on the first '='
+            name, value = name.strip(), value.strip()
+            # Strip surrounding double quotes from the value (Selenium does not expect them)
+            if value.startswith('"') and value.endswith('"'):
+                value = value[1:-1]
+            cookie_dict[name] = value
+        return cookie_dict
\ No newline at end of file