# osc/spiders/MediaSpiders/MediaSpiders/spiders/FacebookUserSpider.py
# -*- coding: utf-8 -*-
import json
import logging as logger
import random
import re
import time

import redis
import scrapy
from redisbloom.client import Client
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.action_chains import ActionChains

from MediaSpiders.items import MediaspidersItem
from MediaSpiders.spiders.TwitterUserInfoSpider import form_cookie_dict
from MediaSpiders.utils.http_utils import http_post
from MediaSpiders.utils.string_utils import get_str_md5
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
class FacebookSpider(scrapy.Spider):
    """Selenium-driven spider that crawls Facebook user timelines.

    Workflow:
      1. ``start_requests`` opens Redis / RedisBloom connections and requests
         a throw-away page so scrapy-selenium attaches a live webdriver.
      2. ``parse`` injects login cookies stored in Redis, fetches the target
         account list from an internal HTTP API, and yields one
         ``MediaspidersItem`` per timeline post.
      3. When ``crawl_comment`` is enabled, ``comment_parse`` follows each
         queued post URL and yields one item per comment, chaining itself
         until the queue is empty.
    """

    name = 'FacebookUserSpider'

    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/facebook',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_publicinfo_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300
        },
        'DOWNLOAD_DELAY': 2,
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
            'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
        }
    }

    def __init__(self, params=None, *args, **kwargs):
        """``params`` is a JSON string; its optional ``job_id`` key is kept
        on the spider for downstream pipelines.

        :raises TypeError/ValueError: if ``params`` is missing or not JSON.
        """
        super(FacebookSpider, self).__init__(*args, **kwargs)
        json_params = json.loads(params)
        logger.info(json_params)
        self.crawl_comment = False
        self.redis_client = None
        self.bloom_filter = None
        self.simhash_filter_key = None
        # Queue of posts whose comments still need crawling. Was a mutable
        # CLASS attribute, i.e. shared between spider instances — now
        # per-instance state.
        self.comment_urls = []
        if 'job_id' in json_params:
            self.job_id = json_params['job_id']

    def start_requests(self):
        """Connect to Redis / RedisBloom, then request a dummy page so that
        scrapy-selenium hands ``parse`` a live webdriver in the meta."""
        self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                                        password=self.settings['REDIS_PWD'])
        self.bloom_filter = Client(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                                   password=self.settings['REDIS_PWD'])
        self.simhash_filter_key = self.settings['FACEBOOK_SIMHASH_FILTER_KEY']
        # Any URL works here; we only need the driver attached to the response.
        yield SeleniumRequest(url='https://www.google.com/', callback=self.parse)

    def parse(self, response):
        """Log in with cookies from Redis, then walk each target account's
        timeline, yielding post items and (optionally) a comment request."""
        logger.info("login facebook")
        driver = response.request.meta['driver']
        driver.maximize_window()
        # Visit the main domain first — Selenium can only set cookies for
        # the domain of the currently loaded page.
        driver.get("https://www.facebook.com/")
        time.sleep(2)
        # Build Selenium-format cookies (domain and path are mandatory) from
        # the raw cookie string stored in Redis.
        cookie_string = self.redis_client.get("MediaSpiders:Facebook_Cookies").decode()
        cookie_dict = form_cookie_dict(cookie_string)
        cookies_to_add = []
        for name, value in cookie_dict.items():
            cookies_to_add.append({
                'name': name,
                'value': value,
                'domain': '.facebook.com',
                'path': '/',
                'secure': True
            })
        for cookie in cookies_to_add:
            try:
                driver.add_cookie(cookie)
            except Exception as e:
                logger.warning(f"Failed to add cookie {cookie['name']}: {e}")
        driver.refresh()
        time.sleep(5)
        # Fetch the list of accounts to crawl from the internal user API.
        account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
        account_query_api = account_query_api.format(sortBy="id", shuffleResult="true")
        post_data = {
            'userType': self.settings['FACEBOOK_USER_TYPE'],
            'userFlag': 0
        }
        account_rsp = json.loads(
            http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
        all_user_info = []
        if account_rsp['code'] == 200:
            all_user_info = account_rsp['content']
            logger.info('GET %s users' % account_rsp['message'])
        driver.set_window_size(1920, 1080)
        for user_info in all_user_info:
            uid = user_info['userUid']
            current_url = 'https://www.facebook.com/%s' % uid
            driver.get(current_url)
            try:
                # Click once to dismiss possible overlays (login prompts etc.).
                ActionChains(driver).move_by_offset(200, 100).click().perform()
            except Exception as e:
                logger.warning("Exception: {},点击页面时有元素被遮挡".format(repr(e)))
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(5)
            last_page_articles_count = 0
            logger.info("Current URL: {}".format(current_url))
            current_page_articles = driver.find_elements_by_xpath(
                "//div[@data-pagelet='ProfileTimeline']/div[position() > %s]" % last_page_articles_count)
            items = self.get_article(current_page_articles, uid, driver)
            # An empty `items` means everything parsed was filtered as duplicate.
            for item in items:
                if 'es_commentcount' in item and int(item['es_commentcount']) > 0:
                    self.comment_urls.append(
                        {'url': item['es_urlname'], 'article_id': item['es_sid'], 'article_author': item['es_authors'],
                         'article_text': item['es_urlcontent']})
                logger.info("用户 {} 发文内容: {}".format(item['es_userid'], item['es_urlcontent']))
                yield item
        if self.crawl_comment and len(self.comment_urls) > 0:
            comment_url = self.comment_urls.pop()
            # Forward the author as well — previously only article_id was put
            # in the meta, so comment_parse could never recover the author.
            yield SeleniumRequest(url=comment_url['url'], callback=self.comment_parse,
                                  meta={'article_id': comment_url['article_id'],
                                        'article_author': comment_url['article_author']})

    def get_article(self, articles, uid, browser):
        """Parse timeline post WebElements into ``MediaspidersItem`` objects.

        :param articles: list of Selenium WebElements, one per post.
        :param uid: Facebook account id the posts belong to.
        :param browser: the Selenium webdriver (used only to resize window).
        :return: list of items; posts that fail to parse, are Bloom-filter
                 duplicates, or are still collapsed ("展开") are skipped.
        """
        browser.maximize_window()
        article_items = []
        for article in articles:
            item = MediaspidersItem()
            try:
                # --- user name: h2 > b > span, falling back to dir=auto span ---
                try:
                    uname = article.find_element_by_xpath('.//h2//b//span').text
                except Exception:
                    try:
                        uname = article.find_element_by_xpath('.//h2//span[@dir="auto"]').text
                    except Exception:
                        uname = uid
                # --- post URL and display time: the <a> linking to /posts/pfbid ---
                post_link_elem = article.find_element_by_xpath(".//a[contains(@href,'/posts/pfbid')]")
                article_url = post_link_elem.get_attribute('href')
                article_url = article_url.split("?")[0]
                article_time = post_link_elem.text  # timestamp text lives inside the <a>
                # --- expand truncated text ("展开"/"See more"), if present ---
                try:
                    clickable_fields = article.find_elements_by_xpath(".//div[@role='button']")
                    if len(clickable_fields) > 0:
                        for cf in clickable_fields:
                            cf_text = cf.text
                            if cf_text is not None and ("展开" in cf_text or "See more" in cf_text):
                                cf.click()
                                time.sleep(1)
                                break
                except Exception as e:
                    logger.debug(repr(e))
                # --- body text: data-ad-rendering-role="story_message" ---
                try:
                    article_text_lines = article.find_elements_by_xpath(
                        ".//div[@data-ad-rendering-role='story_message']")
                    text_info = []
                    for line in article_text_lines:
                        text_info.append(line.text)
                    article_text = "".join(text_info)
                except Exception:
                    article_text = ""
                # --- normalize the display time into a timestamp ---
                logger.info(f"article_time: {article_time}")
                article_time = get_time_stamp(article_time)
                logger.info(f"urltime: {article_time}")
                # --- images ---
                img_urls = []
                try:
                    imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img")
                    for img in imgs:
                        src = img.get_attribute("src")
                        if src and "emoji" not in src:  # skip emoji sprites
                            img_urls.append(src)
                except Exception:
                    pass
                # --- videos (not collected yet) ---
                video_urls = []
                # --- engagement stats: likes, comments, shares ---
                like_count = 0
                comment_count = 0
                forward_count = 0
                try:
                    # Like count is matched via the aria-label text.
                    like_label_elem = article.find_element_by_xpath(
                        ".//div[@aria-label and contains(@aria-label, '赞:')]")
                    like_label = like_label_elem.get_attribute("aria-label")
                    # `re` is imported at module level (was re-imported here
                    # on every iteration).
                    like_match = re.search(r'(\d+)', like_label)
                    if like_match:
                        like_count = int(like_match.group(1))
                except Exception:
                    pass
                try:
                    # Comment / share counts: two spans inside the toolbar.
                    # NOTE(review): 'xt0b8zv x135b78x' is an obfuscated FB
                    # class name and will break when Facebook rotates it.
                    stat_spans = article.find_elements_by_xpath(
                        ".//div[@role='toolbar']//span[@class='xt0b8zv x135b78x']")
                    if len(stat_spans) >= 2:
                        comment_count = int(stat_spans[0].text.replace(",", "")) if stat_spans[0].text.replace(",",
                                                                                                               "").isdigit() else 0
                        forward_count = int(stat_spans[1].text.replace(",", "")) if stat_spans[1].text.replace(",",
                                                                                                               "").isdigit() else 0
                except Exception:
                    logger.warning("获取点赞/评论/转发数量异常")
                # --- fill the item ---
                article_id = get_str_md5(article_text)
                item['es_sid'] = str(article_id)
                item['es_hkey'] = str(article_id)
                item['es_content'] = str(article_text).replace('查看翻译', '')
                item['es_urlcontent'] = str(article_text).replace('查看翻译', '')
                item['es_urltime'] = article_time
                item['es_lasttime'] = get_current_timestamp()
                item['es_loadtime'] = get_current_timestamp()
                item['es_urltitle'] = uname
                item['es_authors'] = uid
                item['es_userid'] = uid
                item['image_urls'] = img_urls
                item['file_urls'] = video_urls
                item['es_urlname'] = article_url
                item['es_commentcount'] = comment_count
                item['es_forwardcount'] = forward_count
                item['es_likecount'] = like_count
                item['es_sitename'] = 'facebook'
                item['es_srcname'] = 'facebook'
                item['es_carriertype'] = 'media'
                # --- dedup: only posts older than 48h go through the Bloom
                # filter; bfAdd() returns 0 when the key was already present ---
                if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
                    if self.bloom_filter.bfAdd(self.settings['FACEBOOK_FILTER_KEY'], article_id) <= 0:
                        logger.info("跳过已采集内容")
                        continue
                if item['es_urlcontent'].endswith('展开'):
                    logger.info("跳过未展开的内容")
                    continue
                article_items.append(item)
            except Exception as e:
                logger.debug("解析单条帖子失败: %s" % repr(e))
                continue
        logger.info("用户 {} 的发文数量为 {}".format(uid, len(article_items)))
        return article_items

    def comment_parse(self, response):
        """Load all comments of one post (mobile layout), yield one item per
        comment, then schedule the next queued post, if any."""
        browser = response.request.meta['driver']
        article_id = response.request.meta['article_id']
        # Bug fix: this used to read meta['article_id'], so es_catalog2
        # carried the post id instead of the post author.
        article_author = response.request.meta.get('article_author', '')
        # Keep clicking "see next" and scrolling until the page stops growing.
        browser.execute_script(
            'window.scrollTo(0, document.body.scrollHeight)')
        see_next = False
        try:
            see_next = browser.find_element_by_xpath(
                '//div[contains(@id,"see_next")]')
        except Exception:
            pass
        while see_next:
            body_height1 = browser.execute_script(
                'return document.body.scrollHeight')
            see_next.click()
            time.sleep(1)
            browser.execute_script(
                'window.scrollTo(0, document.body.scrollHeight)')
            body_height2 = browser.execute_script(
                'return document.body.scrollHeight')
            if body_height1 == body_height2:
                break
            try:
                see_next = browser.find_element_by_xpath(
                    '//div[contains(@id,"see_next")]')
            except Exception:
                see_next = False
        # Extract every loaded comment.
        comment_elements = browser.find_elements_by_xpath(
            '//div[@data-sigil="comment"]')
        for comment_e in comment_elements:
            comment_user_name = comment_e.find_element_by_xpath(
                './div[2]/div/div//a | .//div[@data-sigil="comment-body"]/a').text
            comment_body = comment_e.find_element_by_xpath(
                './/div[@data-sigil="comment-body"]')
            comment_id = comment_body.get_attribute(
                'data-commentid').split('_')[1]
            comment_content = comment_body.text
            comment_item = MediaspidersItem()
            comment_item['es_authors'] = comment_user_name
            comment_item['es_sid'] = comment_id
            comment_item['es_content'] = comment_content
            comment_item['es_hkey'] = str(article_id)
            comment_item['es_catalog2'] = article_author
            comment_item['es_sitename'] = 'facebook'
            comment_item['es_srcname'] = 'facebook_comment'
            comment_item['es_carriertype'] = 'facebook_comment'
            # Remember the commenter's profile URL for the user spider.
            # find_elements (plural) so a comment without a profile link is
            # skipped instead of raising NoSuchElementException and aborting
            # the whole generator (find_element never returns falsy).
            comment_user_links = comment_e.find_elements_by_xpath(
                './div[1]/div/a')
            if comment_user_links:
                comment_user_url = comment_user_links[0].get_attribute('href')
                self.redis_client.sadd('URL_Filter:MediaSpiders:Facebook_Comment_User_Filter', comment_user_url)
            yield comment_item
        # Chain: request the comments of the next queued post.
        if len(self.comment_urls) > 0:
            comment_url = self.comment_urls.pop()
            yield SeleniumRequest(url=comment_url['url'], callback=self.comment_parse,
                                  meta={'article_id': comment_url['article_id'],
                                        'article_author': comment_url['article_author']})