From 959ffe6b2e2809887c46b59da4ac532ebb1f03dc Mon Sep 17 00:00:00 2001 From: DELL Date: Mon, 26 Jan 2026 16:54:28 +0800 Subject: [PATCH] =?UTF-8?q?[facebook]driver=E6=B7=BB=E5=8A=A0cookie?= =?UTF-8?q?=EF=BC=8C=E4=BC=98=E5=8C=96=E9=A1=B5=E9=9D=A2=E9=87=87=E9=9B=86?= =?UTF-8?q?=E6=B5=81=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../spiders/FacebookUserSpider.py | 160 ++++++++++++------ 1 file changed, 109 insertions(+), 51 deletions(-) diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/FacebookUserSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/FacebookUserSpider.py index 339dead..04e7473 100644 --- a/spiders/MediaSpiders/MediaSpiders/spiders/FacebookUserSpider.py +++ b/spiders/MediaSpiders/MediaSpiders/spiders/FacebookUserSpider.py @@ -9,8 +9,10 @@ import scrapy from redisbloom.client import Client from scrapy_selenium import SeleniumRequest from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.common.by import By from MediaSpiders.items import MediaspidersItem +from MediaSpiders.spiders.TwitterUserInfoSpider import form_cookie_dict from MediaSpiders.utils.http_utils import http_post from MediaSpiders.utils.login_utils import login from MediaSpiders.utils.string_utils import get_str_md5 @@ -65,31 +67,35 @@ class FacebookSpider(scrapy.Spider): def parse(self, response): logger.info("login facebook") - # driver = response.request.meta['driver'] - # driver.maximize_window() - # driver.get('https://m.facebook.com/') - # time.sleep(3) - # user_list = [] - # for u in login_users: - # user_list.append(json.loads(u.decode())) - # login_user = random.choice(user_list) + driver = response.request.meta['driver'] + driver.maximize_window() + # 访问主域,再设 Cookie + driver.get("https://www.facebook.com/") + time.sleep(2) - # driver.find_element_by_xpath( - # '//input[@name="email"]').send_keys(login_user['uid']) - # driver.find_element_by_xpath( - # '//input[@name="pass"]').send_keys(login_user['pwd']) - # driver.find_element_by_xpath('//button[@name="login"]').click() - # time.sleep(10) - # logger.info("login as %s" % login_user['uid']) + # 添加 Cookie(确保 domain 是 .facebook.com) + cookie_string = self.redis_client.get("MediaSpiders:Facebook_Cookies").decode() + cookie_dict = form_cookie_dict(cookie_string) # 你已有此函数 - # 获取采集登录账号并登录 - login_users = self.redis_client.smembers('MediaSpiders:Facebook_login_accounts') - driver = login().login_with_selenium( - 'https://m.facebook.com/', - self.name, - login_users=login_users, - response=response - ) + # 转换为 Selenium 所需格式(必须含 domain 和 path) + cookies_to_add = [] + for name, value in cookie_dict.items(): + cookies_to_add.append({ + 'name': name, + 'value': value, + 'domain': '.facebook.com', + 'path': '/', + 'secure': True + }) + + for cookie in cookies_to_add: + try: + driver.add_cookie(cookie) + except Exception as e: + logger.warning(f"Failed to add cookie {cookie['name']}: {e}") + + driver.refresh() + time.sleep(5) # 获取待采集目标账号,并逐个请求 account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API'] @@ -98,9 +104,11 @@ class FacebookSpider(scrapy.Spider): 'userType': self.settings['FACEBOOK_USER_TYPE'], 'userFlag': 0 } + account_rsp = json.loads( http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text) all_user_info = [] + if account_rsp['code'] == 200: all_user_info = account_rsp['content'] logger.info('GET %s users' % account_rsp['message']) @@ -117,6 +125,7 @@ class FacebookSpider(scrapy.Spider): time.sleep(5) last_page_articles_count = 0 logger.info("Current URL: {}".format(current_url)) + # current_page_articles = driver.find_elements_by_xpath( "//div[@data-pagelet='ProfileTimeline']/div[position() > %s]" % last_page_articles_count) items = self.get_article(current_page_articles, uid, driver) @@ -139,51 +148,95 @@ class FacebookSpider(scrapy.Spider): for article in articles: item = MediaspidersItem() try: - uname = article.find_element_by_xpath('.//h2//strong/span').text - article_url = article.find_element_by_xpath(".//a[contains(@href,'/posts/')]").get_attribute('href') + # === 用户名:从 h2 下的 b/span 或直接 span 提取 === + try: + uname = article.find_element_by_xpath('.//h2//b//span').text + except: + try: + uname = article.find_element_by_xpath('.//h2//span[@dir="auto"]').text + except: + uname = uid + + # === 文章链接和时间:从包含 /posts/pfbid 的 a 标签提取 === + post_link_elem = article.find_element_by_xpath(".//a[contains(@href,'/posts/pfbid')]") + article_url = post_link_elem.get_attribute('href') article_url = article_url.split("?")[0] + article_time = post_link_elem.text # 时间文本直接在 a 标签内 + + # === 展开全文(如有)=== try: clickable_fields = article.find_elements_by_xpath(".//div[@role='button']") if len(clickable_fields) > 0: for cf in clickable_fields: cf_text = cf.text - if cf_text is not None and cf_text == "展开": + if cf_text is not None and ("展开" in cf_text or "See more" in cf_text): cf.click() + time.sleep(1) break except Exception as e: logger.debug(repr(e)) - article_text_lines = article.find_elements_by_xpath(".//div[@data-ad-preview='message']") - text_info = [] - for line in article_text_lines: - text_info.append(line.text) - article_text = "".join(text_info) - article_time = article.find_element_by_xpath(".//a[contains(@href,'/posts/')]/span").text + + # === 正文内容:使用 data-ad-rendering-role="story_message" === + try: + article_text_lines = article.find_elements_by_xpath( + ".//div[@data-ad-rendering-role='story_message']") + text_info = [] + for line in article_text_lines: + text_info.append(line.text) + article_text = "".join(text_info) + except: + article_text = "" + + # === 时间戳处理 === logger.info(f"article_time: {article_time}") - article_time = get_time_stamp( - article_time) # 这里的 article_time 必须是中文模式下的时间,比如“1天”、“5小时”等,需要登陆Facebook后切换语言 + article_time = get_time_stamp(article_time) logger.info(f"urltime: {article_time}") + + # === 图片提取 === img_urls = [] - imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img") - for img in imgs: - img_urls.append(img.get_attribute("src")) + try: + imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img") + for img in imgs: + src = img.get_attribute("src") + if src and "emoji" not in src: # 过滤 emoji 图片 + img_urls.append(src) + except: + pass + + # === 视频(暂留空)=== video_urls = [] - article_id = get_str_md5(article_text) + + # === 互动数据:点赞、评论、转发 === like_count = 0 comment_count = 0 forward_count = 0 - like_count_str = article.find_element_by_xpath( - ".//div[@data-visualcompletion='ignore-dynamic']//span[@aria-hidden='true']").text - comment_and_forward_element = article.find_elements_by_xpath(".//div[@tabindex='0']//span[@dir='auto']") + try: - if like_count_str: - like_count = int(like_count_str.replace(",", "")) - if len(comment_and_forward_element) > 1: - comment_count_str = comment_and_forward_element[0].text - forward_count_str = comment_and_forward_element[1].text - comment_count = int(comment_count_str.replace(",", "")) - forward_count = int(forward_count_str.replace(",", "")) + # 点赞数:通过 aria-label 匹配 + like_label_elem = article.find_element_by_xpath( + ".//div[@aria-label and contains(@aria-label, '赞:')]") + like_label = like_label_elem.get_attribute("aria-label") + import re + like_match = re.search(r'(\d+)', like_label) + if like_match: + like_count = int(like_match.group(1)) + except: + pass + + try: + # 评论和转发:通常在 toolbar 内的两个 span 中 + stat_spans = article.find_elements_by_xpath( + ".//div[@role='toolbar']//span[@class='xt0b8zv x135b78x']") + if len(stat_spans) >= 2: + comment_count = int(stat_spans[0].text.replace(",", "")) if stat_spans[0].text.replace(",", + "").isdigit() else 0 + forward_count = int(stat_spans[1].text.replace(",", "")) if stat_spans[1].text.replace(",", + "").isdigit() else 0 except: logger.warning("获取点赞/评论/转发数量异常") + + # === 填充 Item === + article_id = get_str_md5(article_text) item['es_sid'] = str(article_id) item['es_hkey'] = str(article_id) item['es_content'] = str(article_text).replace('查看翻译', '') @@ -203,20 +256,25 @@ class FacebookSpider(scrapy.Spider): item['es_sitename'] = 'facebook' item['es_srcname'] = 'facebook' item['es_carriertype'] = 'media' - # 判重 + + # === 判重逻辑 === if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000: if self.bloom_filter.bfAdd(self.settings['FACEBOOK_FILTER_KEY'], article_id) <= 0: logger.info("跳过已采集内容") continue + if item['es_urlcontent'].endswith('展开'): logger.info("跳过未展开的内容") continue + article_items.append(item) + except Exception as e: - logger.debug(repr(e)) + logger.debug("解析单条帖子失败: %s" % repr(e)) + continue + logger.info("用户 {} 的发文数量为 {}".format(uid, len(article_items))) return article_items - def comment_parse(self, response): browser = response.request.meta['driver'] article_id = response.request.meta['article_id']