[facebook]driver添加cookie,优化页面采集流程
This commit is contained in:
parent
93a8ff5ef4
commit
959ffe6b2e
@ -9,8 +9,10 @@ import scrapy
|
|||||||
from redisbloom.client import Client
|
from redisbloom.client import Client
|
||||||
from scrapy_selenium import SeleniumRequest
|
from scrapy_selenium import SeleniumRequest
|
||||||
from selenium.webdriver.common.action_chains import ActionChains
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
|
||||||
from MediaSpiders.items import MediaspidersItem
|
from MediaSpiders.items import MediaspidersItem
|
||||||
|
from MediaSpiders.spiders.TwitterUserInfoSpider import form_cookie_dict
|
||||||
from MediaSpiders.utils.http_utils import http_post
|
from MediaSpiders.utils.http_utils import http_post
|
||||||
from MediaSpiders.utils.login_utils import login
|
from MediaSpiders.utils.login_utils import login
|
||||||
from MediaSpiders.utils.string_utils import get_str_md5
|
from MediaSpiders.utils.string_utils import get_str_md5
|
||||||
@ -65,31 +67,35 @@ class FacebookSpider(scrapy.Spider):
|
|||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
logger.info("login facebook")
|
logger.info("login facebook")
|
||||||
# driver = response.request.meta['driver']
|
driver = response.request.meta['driver']
|
||||||
# driver.maximize_window()
|
driver.maximize_window()
|
||||||
# driver.get('https://m.facebook.com/')
|
# 访问主域,再设 Cookie
|
||||||
# time.sleep(3)
|
driver.get("https://www.facebook.com/")
|
||||||
# user_list = []
|
time.sleep(2)
|
||||||
# for u in login_users:
|
|
||||||
# user_list.append(json.loads(u.decode()))
|
|
||||||
# login_user = random.choice(user_list)
|
|
||||||
|
|
||||||
# driver.find_element_by_xpath(
|
# 添加 Cookie(确保 domain 是 .facebook.com)
|
||||||
# '//input[@name="email"]').send_keys(login_user['uid'])
|
cookie_string = self.redis_client.get("MediaSpiders:Facebook_Cookies").decode()
|
||||||
# driver.find_element_by_xpath(
|
cookie_dict = form_cookie_dict(cookie_string) # 你已有此函数
|
||||||
# '//input[@name="pass"]').send_keys(login_user['pwd'])
|
|
||||||
# driver.find_element_by_xpath('//button[@name="login"]').click()
|
|
||||||
# time.sleep(10)
|
|
||||||
# logger.info("login as %s" % login_user['uid'])
|
|
||||||
|
|
||||||
# 获取采集登录账号并登录
|
# 转换为 Selenium 所需格式(必须含 domain 和 path)
|
||||||
login_users = self.redis_client.smembers('MediaSpiders:Facebook_login_accounts')
|
cookies_to_add = []
|
||||||
driver = login().login_with_selenium(
|
for name, value in cookie_dict.items():
|
||||||
'https://m.facebook.com/',
|
cookies_to_add.append({
|
||||||
self.name,
|
'name': name,
|
||||||
login_users=login_users,
|
'value': value,
|
||||||
response=response
|
'domain': '.facebook.com',
|
||||||
)
|
'path': '/',
|
||||||
|
'secure': True
|
||||||
|
})
|
||||||
|
|
||||||
|
for cookie in cookies_to_add:
|
||||||
|
try:
|
||||||
|
driver.add_cookie(cookie)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to add cookie {cookie['name']}: {e}")
|
||||||
|
|
||||||
|
driver.refresh()
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
# 获取待采集目标账号,并逐个请求
|
# 获取待采集目标账号,并逐个请求
|
||||||
account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
|
account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
|
||||||
@ -98,9 +104,11 @@ class FacebookSpider(scrapy.Spider):
|
|||||||
'userType': self.settings['FACEBOOK_USER_TYPE'],
|
'userType': self.settings['FACEBOOK_USER_TYPE'],
|
||||||
'userFlag': 0
|
'userFlag': 0
|
||||||
}
|
}
|
||||||
|
|
||||||
account_rsp = json.loads(
|
account_rsp = json.loads(
|
||||||
http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
|
http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
|
||||||
all_user_info = []
|
all_user_info = []
|
||||||
|
|
||||||
if account_rsp['code'] == 200:
|
if account_rsp['code'] == 200:
|
||||||
all_user_info = account_rsp['content']
|
all_user_info = account_rsp['content']
|
||||||
logger.info('GET %s users' % account_rsp['message'])
|
logger.info('GET %s users' % account_rsp['message'])
|
||||||
@ -117,6 +125,7 @@ class FacebookSpider(scrapy.Spider):
|
|||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
last_page_articles_count = 0
|
last_page_articles_count = 0
|
||||||
logger.info("Current URL: {}".format(current_url))
|
logger.info("Current URL: {}".format(current_url))
|
||||||
|
#
|
||||||
current_page_articles = driver.find_elements_by_xpath(
|
current_page_articles = driver.find_elements_by_xpath(
|
||||||
"//div[@data-pagelet='ProfileTimeline']/div[position() > %s]" % last_page_articles_count)
|
"//div[@data-pagelet='ProfileTimeline']/div[position() > %s]" % last_page_articles_count)
|
||||||
items = self.get_article(current_page_articles, uid, driver)
|
items = self.get_article(current_page_articles, uid, driver)
|
||||||
@ -139,51 +148,95 @@ class FacebookSpider(scrapy.Spider):
|
|||||||
for article in articles:
|
for article in articles:
|
||||||
item = MediaspidersItem()
|
item = MediaspidersItem()
|
||||||
try:
|
try:
|
||||||
uname = article.find_element_by_xpath('.//h2//strong/span').text
|
# === 用户名:从 h2 下的 b/span 或直接 span 提取 ===
|
||||||
article_url = article.find_element_by_xpath(".//a[contains(@href,'/posts/')]").get_attribute('href')
|
try:
|
||||||
|
uname = article.find_element_by_xpath('.//h2//b//span').text
|
||||||
|
except:
|
||||||
|
try:
|
||||||
|
uname = article.find_element_by_xpath('.//h2//span[@dir="auto"]').text
|
||||||
|
except:
|
||||||
|
uname = uid
|
||||||
|
|
||||||
|
# === 文章链接和时间:从包含 /posts/pfbid 的 a 标签提取 ===
|
||||||
|
post_link_elem = article.find_element_by_xpath(".//a[contains(@href,'/posts/pfbid')]")
|
||||||
|
article_url = post_link_elem.get_attribute('href')
|
||||||
article_url = article_url.split("?")[0]
|
article_url = article_url.split("?")[0]
|
||||||
|
article_time = post_link_elem.text # 时间文本直接在 a 标签内
|
||||||
|
|
||||||
|
# === 展开全文(如有)===
|
||||||
try:
|
try:
|
||||||
clickable_fields = article.find_elements_by_xpath(".//div[@role='button']")
|
clickable_fields = article.find_elements_by_xpath(".//div[@role='button']")
|
||||||
if len(clickable_fields) > 0:
|
if len(clickable_fields) > 0:
|
||||||
for cf in clickable_fields:
|
for cf in clickable_fields:
|
||||||
cf_text = cf.text
|
cf_text = cf.text
|
||||||
if cf_text is not None and cf_text == "展开":
|
if cf_text is not None and ("展开" in cf_text or "See more" in cf_text):
|
||||||
cf.click()
|
cf.click()
|
||||||
|
time.sleep(1)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(repr(e))
|
logger.debug(repr(e))
|
||||||
article_text_lines = article.find_elements_by_xpath(".//div[@data-ad-preview='message']")
|
|
||||||
|
# === 正文内容:使用 data-ad-rendering-role="story_message" ===
|
||||||
|
try:
|
||||||
|
article_text_lines = article.find_elements_by_xpath(
|
||||||
|
".//div[@data-ad-rendering-role='story_message']")
|
||||||
text_info = []
|
text_info = []
|
||||||
for line in article_text_lines:
|
for line in article_text_lines:
|
||||||
text_info.append(line.text)
|
text_info.append(line.text)
|
||||||
article_text = "".join(text_info)
|
article_text = "".join(text_info)
|
||||||
article_time = article.find_element_by_xpath(".//a[contains(@href,'/posts/')]/span").text
|
except:
|
||||||
|
article_text = ""
|
||||||
|
|
||||||
|
# === 时间戳处理 ===
|
||||||
logger.info(f"article_time: {article_time}")
|
logger.info(f"article_time: {article_time}")
|
||||||
article_time = get_time_stamp(
|
article_time = get_time_stamp(article_time)
|
||||||
article_time) # 这里的 article_time 必须是中文模式下的时间,比如“1天”、“5小时”等,需要登陆Facebook后切换语言
|
|
||||||
logger.info(f"urltime: {article_time}")
|
logger.info(f"urltime: {article_time}")
|
||||||
|
|
||||||
|
# === 图片提取 ===
|
||||||
img_urls = []
|
img_urls = []
|
||||||
|
try:
|
||||||
imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img")
|
imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img")
|
||||||
for img in imgs:
|
for img in imgs:
|
||||||
img_urls.append(img.get_attribute("src"))
|
src = img.get_attribute("src")
|
||||||
|
if src and "emoji" not in src: # 过滤 emoji 图片
|
||||||
|
img_urls.append(src)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# === 视频(暂留空)===
|
||||||
video_urls = []
|
video_urls = []
|
||||||
article_id = get_str_md5(article_text)
|
|
||||||
|
# === 互动数据:点赞、评论、转发 ===
|
||||||
like_count = 0
|
like_count = 0
|
||||||
comment_count = 0
|
comment_count = 0
|
||||||
forward_count = 0
|
forward_count = 0
|
||||||
like_count_str = article.find_element_by_xpath(
|
|
||||||
".//div[@data-visualcompletion='ignore-dynamic']//span[@aria-hidden='true']").text
|
|
||||||
comment_and_forward_element = article.find_elements_by_xpath(".//div[@tabindex='0']//span[@dir='auto']")
|
|
||||||
try:
|
try:
|
||||||
if like_count_str:
|
# 点赞数:通过 aria-label 匹配
|
||||||
like_count = int(like_count_str.replace(",", ""))
|
like_label_elem = article.find_element_by_xpath(
|
||||||
if len(comment_and_forward_element) > 1:
|
".//div[@aria-label and contains(@aria-label, '赞:')]")
|
||||||
comment_count_str = comment_and_forward_element[0].text
|
like_label = like_label_elem.get_attribute("aria-label")
|
||||||
forward_count_str = comment_and_forward_element[1].text
|
import re
|
||||||
comment_count = int(comment_count_str.replace(",", ""))
|
like_match = re.search(r'(\d+)', like_label)
|
||||||
forward_count = int(forward_count_str.replace(",", ""))
|
if like_match:
|
||||||
|
like_count = int(like_match.group(1))
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 评论和转发:通常在 toolbar 内的两个 span 中
|
||||||
|
stat_spans = article.find_elements_by_xpath(
|
||||||
|
".//div[@role='toolbar']//span[@class='xt0b8zv x135b78x']")
|
||||||
|
if len(stat_spans) >= 2:
|
||||||
|
comment_count = int(stat_spans[0].text.replace(",", "")) if stat_spans[0].text.replace(",",
|
||||||
|
"").isdigit() else 0
|
||||||
|
forward_count = int(stat_spans[1].text.replace(",", "")) if stat_spans[1].text.replace(",",
|
||||||
|
"").isdigit() else 0
|
||||||
except:
|
except:
|
||||||
logger.warning("获取点赞/评论/转发数量异常")
|
logger.warning("获取点赞/评论/转发数量异常")
|
||||||
|
|
||||||
|
# === 填充 Item ===
|
||||||
|
article_id = get_str_md5(article_text)
|
||||||
item['es_sid'] = str(article_id)
|
item['es_sid'] = str(article_id)
|
||||||
item['es_hkey'] = str(article_id)
|
item['es_hkey'] = str(article_id)
|
||||||
item['es_content'] = str(article_text).replace('查看翻译', '')
|
item['es_content'] = str(article_text).replace('查看翻译', '')
|
||||||
@ -203,20 +256,25 @@ class FacebookSpider(scrapy.Spider):
|
|||||||
item['es_sitename'] = 'facebook'
|
item['es_sitename'] = 'facebook'
|
||||||
item['es_srcname'] = 'facebook'
|
item['es_srcname'] = 'facebook'
|
||||||
item['es_carriertype'] = 'media'
|
item['es_carriertype'] = 'media'
|
||||||
# 判重
|
|
||||||
|
# === 判重逻辑 ===
|
||||||
if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
|
if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
|
||||||
if self.bloom_filter.bfAdd(self.settings['FACEBOOK_FILTER_KEY'], article_id) <= 0:
|
if self.bloom_filter.bfAdd(self.settings['FACEBOOK_FILTER_KEY'], article_id) <= 0:
|
||||||
logger.info("跳过已采集内容")
|
logger.info("跳过已采集内容")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if item['es_urlcontent'].endswith('展开'):
|
if item['es_urlcontent'].endswith('展开'):
|
||||||
logger.info("跳过未展开的内容")
|
logger.info("跳过未展开的内容")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
article_items.append(item)
|
article_items.append(item)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(repr(e))
|
logger.debug("解析单条帖子失败: %s" % repr(e))
|
||||||
|
continue
|
||||||
|
|
||||||
logger.info("用户 {} 的发文数量为 {}".format(uid, len(article_items)))
|
logger.info("用户 {} 的发文数量为 {}".format(uid, len(article_items)))
|
||||||
return article_items
|
return article_items
|
||||||
|
|
||||||
def comment_parse(self, response):
|
def comment_parse(self, response):
|
||||||
browser = response.request.meta['driver']
|
browser = response.request.meta['driver']
|
||||||
article_id = response.request.meta['article_id']
|
article_id = response.request.meta['article_id']
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user