[facebook] Add cookies to the driver; streamline the page-collection flow

DELL 2026-01-26 16:54:28 +08:00
parent 93a8ff5ef4
commit 959ffe6b2e


@@ -9,8 +9,10 @@ import scrapy
 from redisbloom.client import Client
 from scrapy_selenium import SeleniumRequest
 from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.by import By
 from MediaSpiders.items import MediaspidersItem
+from MediaSpiders.spiders.TwitterUserInfoSpider import form_cookie_dict
 from MediaSpiders.utils.http_utils import http_post
 from MediaSpiders.utils.login_utils import login
 from MediaSpiders.utils.string_utils import get_str_md5
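Note: the spider body below still uses the old `find_element_by_xpath` / `find_elements_by_xpath` helpers, which Selenium 4 deprecated and later 4.x releases removed; the newly imported `By` is what the current locator API requires. A minimal sketch of the equivalent calls, assuming a Selenium 4 driver:

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()
    # Selenium 4 style: find_elements(By.XPATH, ...) replaces find_elements_by_xpath(...)
    posts = driver.find_elements(By.XPATH, "//div[@data-pagelet='ProfileTimeline']/div")
    if posts:
        link = posts[0].find_element(By.XPATH, ".//a[contains(@href,'/posts/')]")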
@@ -65,31 +67,35 @@ class FacebookSpider(scrapy.Spider):
     def parse(self, response):
         logger.info("login facebook")
-        # driver = response.request.meta['driver']
-        # driver.maximize_window()
-        # driver.get('https://m.facebook.com/')
-        # time.sleep(3)
-        # user_list = []
-        # for u in login_users:
-        #     user_list.append(json.loads(u.decode()))
-        # login_user = random.choice(user_list)
-        # driver.find_element_by_xpath(
-        #     '//input[@name="email"]').send_keys(login_user['uid'])
-        # driver.find_element_by_xpath(
-        #     '//input[@name="pass"]').send_keys(login_user['pwd'])
-        # driver.find_element_by_xpath('//button[@name="login"]').click()
-        # time.sleep(10)
-        # logger.info("login as %s" % login_user['uid'])
-        # Fetch a login account and sign in
-        login_users = self.redis_client.smembers('MediaSpiders:Facebook_login_accounts')
-        driver = login().login_with_selenium(
-            'https://m.facebook.com/',
-            self.name,
-            login_users=login_users,
-            response=response
-        )
+        driver = response.request.meta['driver']
+        driver.maximize_window()
+        # Visit the main domain first, then set the cookies
+        driver.get("https://www.facebook.com/")
+        time.sleep(2)
+
+        # Add the cookies; the domain must be .facebook.com
+        cookie_string = self.redis_client.get("MediaSpiders:Facebook_Cookies").decode()
+        cookie_dict = form_cookie_dict(cookie_string)  # existing helper
+
+        # Convert to the format Selenium expects (domain and path are required)
+        cookies_to_add = []
+        for name, value in cookie_dict.items():
+            cookies_to_add.append({
+                'name': name,
+                'value': value,
+                'domain': '.facebook.com',
+                'path': '/',
+                'secure': True
+            })
+        for cookie in cookies_to_add:
+            try:
+                driver.add_cookie(cookie)
+            except Exception as e:
+                logger.warning(f"Failed to add cookie {cookie['name']}: {e}")
+        driver.refresh()
+        time.sleep(5)

         # Fetch the target accounts to collect and request them one by one
         account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
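`form_cookie_dict` is borrowed from `TwitterUserInfoSpider` and its body is not part of this diff. A plausible minimal reconstruction, assuming the Redis value is a raw Cookie header string of the form `name1=value1; name2=value2`:

    def form_cookie_dict(cookie_string: str) -> dict:
        """Parse a raw Cookie header string into a name -> value dict.

        Hypothetical reconstruction; the real helper lives in
        MediaSpiders/spiders/TwitterUserInfoSpider.py.
        """
        cookie_dict = {}
        for pair in cookie_string.split(';'):
            pair = pair.strip()
            if not pair or '=' not in pair:
                continue
            name, _, value = pair.partition('=')
            cookie_dict[name.strip()] = value.strip()
        return cookie_dict

Selenium's `add_cookie` only accepts cookies for the domain the browser is currently on, which is why the code navigates to https://www.facebook.com/ before the loop and refreshes afterwards so the session takes effect.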
@@ -98,9 +104,11 @@ class FacebookSpider(scrapy.Spider):
             'userType': self.settings['FACEBOOK_USER_TYPE'],
             'userFlag': 0
         }
         account_rsp = json.loads(
             http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
         all_user_info = []
         if account_rsp['code'] == 200:
             all_user_info = account_rsp['content']
             logger.info('GET %s users' % account_rsp['message'])
@@ -117,6 +125,7 @@ class FacebookSpider(scrapy.Spider):
                 time.sleep(5)
                 last_page_articles_count = 0
                 logger.info("Current URL: {}".format(current_url))
+                #
                 current_page_articles = driver.find_elements_by_xpath(
                     "//div[@data-pagelet='ProfileTimeline']/div[position() > %s]" % last_page_articles_count)
                 items = self.get_article(current_page_articles, uid, driver)
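The `position() > %s` predicate drives incremental scraping: after each scroll, only the timeline children past the last processed index are selected, so earlier posts are not parsed twice. A minimal sketch of that loop under the same `ProfileTimeline` markup (the `process` handler is hypothetical, and the Selenium 3 helper style matches the spider's):

    import time

    last_page_articles_count = 0
    while True:
        # Select only the children that appeared after the last pass
        new_articles = driver.find_elements_by_xpath(
            "//div[@data-pagelet='ProfileTimeline']/div[position() > %s]"
            % last_page_articles_count)
        if not new_articles:
            break
        process(new_articles)  # hypothetical handler
        last_page_articles_count += len(new_articles)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)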
@@ -139,51 +148,95 @@ class FacebookSpider(scrapy.Spider):
         for article in articles:
             item = MediaspidersItem()
             try:
-                uname = article.find_element_by_xpath('.//h2//strong/span').text
-                article_url = article.find_element_by_xpath(".//a[contains(@href,'/posts/')]").get_attribute('href')
+                # === Username: from b/span under h2, or a plain span ===
+                try:
+                    uname = article.find_element_by_xpath('.//h2//b//span').text
+                except:
+                    try:
+                        uname = article.find_element_by_xpath('.//h2//span[@dir="auto"]').text
+                    except:
+                        uname = uid
+
+                # === Post URL and time: from the a tag whose href contains /posts/pfbid ===
+                post_link_elem = article.find_element_by_xpath(".//a[contains(@href,'/posts/pfbid')]")
+                article_url = post_link_elem.get_attribute('href')
                 article_url = article_url.split("?")[0]
+                article_time = post_link_elem.text  # the time text sits directly inside the a tag
+
+                # === Expand the full text (if present) ===
                 try:
                     clickable_fields = article.find_elements_by_xpath(".//div[@role='button']")
                     if len(clickable_fields) > 0:
                         for cf in clickable_fields:
                             cf_text = cf.text
-                            if cf_text is not None and cf_text == "展开":
+                            if cf_text is not None and ("展开" in cf_text or "See more" in cf_text):
                                 cf.click()
+                                time.sleep(1)
                                 break
                 except Exception as e:
                     logger.debug(repr(e))
-                article_text_lines = article.find_elements_by_xpath(".//div[@data-ad-preview='message']")
-                text_info = []
-                for line in article_text_lines:
-                    text_info.append(line.text)
-                article_text = "".join(text_info)
-                article_time = article.find_element_by_xpath(".//a[contains(@href,'/posts/')]/span").text
+
+                # === Body text: use data-ad-rendering-role="story_message" ===
+                try:
+                    article_text_lines = article.find_elements_by_xpath(
+                        ".//div[@data-ad-rendering-role='story_message']")
+                    text_info = []
+                    for line in article_text_lines:
+                        text_info.append(line.text)
+                    article_text = "".join(text_info)
+                except:
+                    article_text = ""
+
+                # === Timestamp handling ===
                 logger.info(f"article_time: {article_time}")
-                article_time = get_time_stamp(
-                    article_time)  # article_time must be a Chinese-locale label such as "1天" or "5小时"; log in to Facebook and switch the language first
+                article_time = get_time_stamp(article_time)
                 logger.info(f"urltime: {article_time}")
+
+                # === Image extraction ===
                 img_urls = []
-                imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img")
-                for img in imgs:
-                    img_urls.append(img.get_attribute("src"))
+                try:
+                    imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img")
+                    for img in imgs:
+                        src = img.get_attribute("src")
+                        if src and "emoji" not in src:  # filter out emoji images
+                            img_urls.append(src)
+                except:
+                    pass
+
+                # === Videos (left empty for now) ===
                 video_urls = []
-                article_id = get_str_md5(article_text)
+
+                # === Engagement data: likes, comments, shares ===
                 like_count = 0
                 comment_count = 0
                 forward_count = 0
-                like_count_str = article.find_element_by_xpath(
-                    ".//div[@data-visualcompletion='ignore-dynamic']//span[@aria-hidden='true']").text
-                comment_and_forward_element = article.find_elements_by_xpath(".//div[@tabindex='0']//span[@dir='auto']")
                 try:
-                    if like_count_str:
-                        like_count = int(like_count_str.replace(",", ""))
-                    if len(comment_and_forward_element) > 1:
-                        comment_count_str = comment_and_forward_element[0].text
-                        forward_count_str = comment_and_forward_element[1].text
-                        comment_count = int(comment_count_str.replace(",", ""))
-                        forward_count = int(forward_count_str.replace(",", ""))
+                    # Like count: matched via aria-label
+                    like_label_elem = article.find_element_by_xpath(
+                        ".//div[@aria-label and contains(@aria-label, '赞:')]")
+                    like_label = like_label_elem.get_attribute("aria-label")
+                    import re
+                    like_match = re.search(r'(\d+)', like_label)
+                    if like_match:
+                        like_count = int(like_match.group(1))
+                except:
+                    pass
+                try:
+                    # Comments and shares: usually the two spans inside the toolbar
+                    stat_spans = article.find_elements_by_xpath(
+                        ".//div[@role='toolbar']//span[@class='xt0b8zv x135b78x']")
+                    if len(stat_spans) >= 2:
+                        comment_count = int(stat_spans[0].text.replace(",", "")) if stat_spans[0].text.replace(",", "").isdigit() else 0
+                        forward_count = int(stat_spans[1].text.replace(",", "")) if stat_spans[1].text.replace(",", "").isdigit() else 0
                 except:
                     logger.warning("Failed to get like/comment/share counts")
+
+                # === Populate the Item ===
+                article_id = get_str_md5(article_text)
                 item['es_sid'] = str(article_id)
                 item['es_hkey'] = str(article_id)
                 item['es_content'] = str(article_text).replace('查看翻译', '')
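The comment removed above documented that `get_time_stamp` expects relative Chinese-locale labels such as "1天" or "5小时", and the 48-hour dedup check below compares its output in epoch milliseconds. A hypothetical reconstruction of such a converter under those two assumptions (the real helper is not in this diff):

    import re
    import time

    def get_time_stamp(label: str) -> int:
        """Convert a relative Chinese time label ('30分钟', '5小时', '1天')
        to an epoch timestamp in milliseconds.

        Hypothetical reconstruction of the MediaSpiders helper.
        """
        units = {'分钟': 60, '小时': 3600, '天': 86400}
        now_ms = int(time.time() * 1000)
        m = re.match(r'(\d+)\s*(分钟|小时|天)', label)
        if not m:
            return now_ms  # unrecognized label: fall back to "now"
        return now_ms - int(m.group(1)) * units[m.group(2)] * 1000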
@@ -203,20 +256,25 @@ class FacebookSpider(scrapy.Spider):
                 item['es_sitename'] = 'facebook'
                 item['es_srcname'] = 'facebook'
                 item['es_carriertype'] = 'media'
-                # Dedup
+
+                # === Dedup logic ===
                 if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
                     if self.bloom_filter.bfAdd(self.settings['FACEBOOK_FILTER_KEY'], article_id) <= 0:
                         logger.info("Skipping already-collected content")
                         continue
                 if item['es_urlcontent'].endswith('展开'):
                     logger.info("Skipping unexpanded content")
                     continue
+
                 article_items.append(item)
             except Exception as e:
-                logger.debug(repr(e))
+                logger.debug("Failed to parse post: %s" % repr(e))
+                continue
         logger.info("User {} post count: {}".format(uid, len(article_items)))
         return article_items

     def comment_parse(self, response):
         browser = response.request.meta['driver']
         article_id = response.request.meta['article_id']
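For the dedup branch above: redisbloom's `bfAdd` returns 1 when the element was newly inserted and 0 when the filter probably already contained it, so the `<= 0` test means "seen before, skip". A minimal standalone usage sketch (connection parameters, key name, and sizing are assumptions):

    from redisbloom.client import Client

    bloom = Client(host='localhost', port=6379)  # assumed connection params
    try:
        # error rate and capacity are illustrative sizing choices
        bloom.bfCreate('MediaSpiders:Facebook_filter', 0.01, 100000)
    except Exception:
        pass  # filter already exists

    article_id = 'abc123'
    if bloom.bfAdd('MediaSpiders:Facebook_filter', article_id) <= 0:
        print('already collected, skip')  # mirrors the spider's dedup branch
    else:
        print('new article, keep')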