339 lines
15 KiB
Python
339 lines
15 KiB
Python
# -*- coding: utf-8 -*-
|
||
import json
|
||
import logging as logger
|
||
import random
|
||
import time
|
||
|
||
import redis
|
||
import scrapy
|
||
from redisbloom.client import Client
|
||
from scrapy_selenium import SeleniumRequest
|
||
from selenium.webdriver.common.action_chains import ActionChains
|
||
|
||
from MediaSpiders.items import MediaspidersItem
|
||
from MediaSpiders.spiders.TwitterUserInfoSpider import form_cookie_dict
|
||
from MediaSpiders.utils.http_utils import http_post
|
||
from MediaSpiders.utils.string_utils import get_str_md5
|
||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||
|
||
|
||
class FacebookSpider(scrapy.Spider):
|
||
name = 'FacebookUserSpider'
|
||
comment_urls = []
|
||
custom_settings = {
|
||
'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
|
||
'PROTO_CLASS_NAME': 'EsSets',
|
||
'PROTO_FIELD_NAME': 'Es',
|
||
'PROTO_SAVE_FILE_NAME': 'public_info_data_',
|
||
'IMAGES_STORE': r'/usr/local/temp_image/facebook',
|
||
'IMAGES_RESULT_FIELD': 'es_urlimage',
|
||
'FILES_STORE': r'/usr/local/videos',
|
||
'FILES_RESULT_FIELD': 'es_video',
|
||
'ZIP_FILE_NAME': 'image_data_publicinfo_',
|
||
'FILE_ZIP_FILE_NAME': 'image_data_plane_',
|
||
'ITEM_PIPELINES': {
|
||
'scrapy.pipelines.images.ImagesPipeline': 2,
|
||
'scrapy.pipelines.files.FilesPipeline': 1,
|
||
'MediaSpiders.pipelines.ProtobufSavePipeline': 300
|
||
},
|
||
'DOWNLOAD_DELAY': 2,
|
||
'SPIDER_MIDDLEWARES': {
|
||
'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
|
||
'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
|
||
'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
|
||
}
|
||
}
|
||
|
||
def __init__(self, params=None, *args, **kwargs):
|
||
super(FacebookSpider, self).__init__(*args, **kwargs)
|
||
json_params = json.loads(params)
|
||
logger.info(json_params)
|
||
self.crawl_comment = False
|
||
self.redis_client = None
|
||
self.bloom_filter = None
|
||
self.simhash_filter_key = None
|
||
if 'job_id' in json_params:
|
||
self.job_id = json_params['job_id']
|
||
|
||
def start_requests(self):
|
||
self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
|
||
password=self.settings['REDIS_PWD'])
|
||
self.bloom_filter = Client(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
|
||
password=self.settings['REDIS_PWD'])
|
||
self.simhash_filter_key = self.settings['FACEBOOK_SIMHASH_FILTER_KEY']
|
||
yield SeleniumRequest(url='https://www.google.com/', callback=self.parse)
|
||
|
||
def parse(self, response):
|
||
logger.info("login facebook")
|
||
driver = response.request.meta['driver']
|
||
driver.maximize_window()
|
||
# 访问主域,再设 Cookie
|
||
driver.get("https://www.facebook.com/")
|
||
time.sleep(2)
|
||
|
||
# 添加 Cookie(确保 domain 是 .facebook.com)
|
||
cookie_string = self.redis_client.get("MediaSpiders:Facebook_Cookies").decode()
|
||
cookie_dict = form_cookie_dict(cookie_string) # 你已有此函数
|
||
|
||
# 转换为 Selenium 所需格式(必须含 domain 和 path)
|
||
cookies_to_add = []
|
||
for name, value in cookie_dict.items():
|
||
cookies_to_add.append({
|
||
'name': name,
|
||
'value': value,
|
||
'domain': '.facebook.com',
|
||
'path': '/',
|
||
'secure': True
|
||
})
|
||
|
||
for cookie in cookies_to_add:
|
||
try:
|
||
driver.add_cookie(cookie)
|
||
except Exception as e:
|
||
logger.warning(f"Failed to add cookie {cookie['name']}: {e}")
|
||
|
||
driver.refresh()
|
||
time.sleep(5)
|
||
|
||
# 获取待采集目标账号,并逐个请求
|
||
account_query_api = self.settings['SOCIAL_USER_QUERY_ALL_API']
|
||
account_query_api = account_query_api.format(sortBy="id", shuffleResult="true")
|
||
post_data = {
|
||
'userType': self.settings['FACEBOOK_USER_TYPE'],
|
||
'userFlag': 0
|
||
}
|
||
|
||
account_rsp = json.loads(
|
||
http_post(account_query_api, json.dumps(post_data), headers={"Content-Type": "application/json"}).text)
|
||
all_user_info = []
|
||
|
||
if account_rsp['code'] == 200:
|
||
all_user_info = account_rsp['content']
|
||
logger.info('GET %s users' % account_rsp['message'])
|
||
driver.set_window_size(1920, 1080)
|
||
for user_info in all_user_info:
|
||
uid = user_info['userUid']
|
||
current_url = 'https://www.facebook.com/%s' % uid
|
||
driver.get(current_url)
|
||
try:
|
||
ActionChains(driver).move_by_offset(200, 100).click().perform()
|
||
except Exception as e:
|
||
logger.warning("Exception: {},点击页面时有元素被遮挡".format(repr(e)))
|
||
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
|
||
time.sleep(5)
|
||
last_page_articles_count = 0
|
||
logger.info("Current URL: {}".format(current_url))
|
||
#
|
||
current_page_articles = driver.find_elements_by_xpath(
|
||
"//div[@data-pagelet='ProfileTimeline']/div[position() > %s]" % last_page_articles_count)
|
||
items = self.get_article(current_page_articles, uid, driver)
|
||
# 如果items为空,说明爬取到的是重复内容
|
||
for item in items:
|
||
if 'es_commentcount' in item and int(item['es_commentcount']) > 0:
|
||
self.comment_urls.append(
|
||
{'url': item['es_urlname'], 'article_id': item['es_sid'], 'article_author': item['es_authors'],
|
||
'article_text': item['es_urlcontent']})
|
||
logger.info("用户 {} 发文内容: {}".format(item['es_userid'], item['es_urlcontent']))
|
||
yield item
|
||
if self.crawl_comment and len(self.comment_urls) > 0:
|
||
comment_url = self.comment_urls.pop()
|
||
yield SeleniumRequest(url=comment_url['url'], callback=self.comment_parse,
|
||
meta={'article_id': comment_url['article_id']})
|
||
|
||
def get_article(self, articles, uid, browser):
|
||
browser.maximize_window()
|
||
article_items = []
|
||
for article in articles:
|
||
item = MediaspidersItem()
|
||
try:
|
||
# === 用户名:从 h2 下的 b/span 或直接 span 提取 ===
|
||
try:
|
||
uname = article.find_element_by_xpath('.//h2//b//span').text
|
||
except:
|
||
try:
|
||
uname = article.find_element_by_xpath('.//h2//span[@dir="auto"]').text
|
||
except:
|
||
uname = uid
|
||
|
||
# === 文章链接和时间:从包含 /posts/pfbid 的 a 标签提取 ===
|
||
post_link_elem = article.find_element_by_xpath(".//a[contains(@href,'/posts/pfbid')]")
|
||
article_url = post_link_elem.get_attribute('href')
|
||
article_url = article_url.split("?")[0]
|
||
article_time = post_link_elem.text # 时间文本直接在 a 标签内
|
||
|
||
# === 展开全文(如有)===
|
||
try:
|
||
clickable_fields = article.find_elements_by_xpath(".//div[@role='button']")
|
||
if len(clickable_fields) > 0:
|
||
for cf in clickable_fields:
|
||
cf_text = cf.text
|
||
if cf_text is not None and ("展开" in cf_text or "See more" in cf_text):
|
||
cf.click()
|
||
time.sleep(1)
|
||
break
|
||
except Exception as e:
|
||
logger.debug(repr(e))
|
||
|
||
# === 正文内容:使用 data-ad-rendering-role="story_message" ===
|
||
try:
|
||
article_text_lines = article.find_elements_by_xpath(
|
||
".//div[@data-ad-rendering-role='story_message']")
|
||
text_info = []
|
||
for line in article_text_lines:
|
||
text_info.append(line.text)
|
||
article_text = "".join(text_info)
|
||
except:
|
||
article_text = ""
|
||
|
||
# === 时间戳处理 ===
|
||
logger.info(f"article_time: {article_time}")
|
||
article_time = get_time_stamp(article_time)
|
||
logger.info(f"urltime: {article_time}")
|
||
|
||
# === 图片提取 ===
|
||
img_urls = []
|
||
try:
|
||
imgs = article.find_elements_by_xpath(".//a[contains(@href,'/photo/')]//img")
|
||
for img in imgs:
|
||
src = img.get_attribute("src")
|
||
if src and "emoji" not in src: # 过滤 emoji 图片
|
||
img_urls.append(src)
|
||
except:
|
||
pass
|
||
|
||
# === 视频(暂留空)===
|
||
video_urls = []
|
||
|
||
# === 互动数据:点赞、评论、转发 ===
|
||
like_count = 0
|
||
comment_count = 0
|
||
forward_count = 0
|
||
|
||
try:
|
||
# 点赞数:通过 aria-label 匹配
|
||
like_label_elem = article.find_element_by_xpath(
|
||
".//div[@aria-label and contains(@aria-label, '赞:')]")
|
||
like_label = like_label_elem.get_attribute("aria-label")
|
||
import re
|
||
like_match = re.search(r'(\d+)', like_label)
|
||
if like_match:
|
||
like_count = int(like_match.group(1))
|
||
except:
|
||
pass
|
||
|
||
try:
|
||
# 评论和转发:通常在 toolbar 内的两个 span 中
|
||
stat_spans = article.find_elements_by_xpath(
|
||
".//div[@role='toolbar']//span[@class='xt0b8zv x135b78x']")
|
||
if len(stat_spans) >= 2:
|
||
comment_count = int(stat_spans[0].text.replace(",", "")) if stat_spans[0].text.replace(",",
|
||
"").isdigit() else 0
|
||
forward_count = int(stat_spans[1].text.replace(",", "")) if stat_spans[1].text.replace(",",
|
||
"").isdigit() else 0
|
||
except:
|
||
logger.warning("获取点赞/评论/转发数量异常")
|
||
|
||
# === 填充 Item ===
|
||
article_id = get_str_md5(article_text)
|
||
item['es_sid'] = str(article_id)
|
||
item['es_hkey'] = str(article_id)
|
||
item['es_content'] = str(article_text).replace('查看翻译', '')
|
||
item['es_urlcontent'] = str(article_text).replace('查看翻译', '')
|
||
item['es_urltime'] = article_time
|
||
item['es_lasttime'] = get_current_timestamp()
|
||
item['es_loadtime'] = get_current_timestamp()
|
||
item['es_urltitle'] = uname
|
||
item['es_authors'] = uid
|
||
item['es_userid'] = uid
|
||
item['image_urls'] = img_urls
|
||
item['file_urls'] = video_urls
|
||
item['es_urlname'] = article_url
|
||
item['es_commentcount'] = comment_count
|
||
item['es_forwardcount'] = forward_count
|
||
item['es_likecount'] = like_count
|
||
item['es_sitename'] = 'facebook'
|
||
item['es_srcname'] = 'facebook'
|
||
item['es_carriertype'] = 'media'
|
||
|
||
# === 判重逻辑 ===
|
||
if item['es_lasttime'] - item['es_urltime'] > 48 * 3600 * 1000:
|
||
if self.bloom_filter.bfAdd(self.settings['FACEBOOK_FILTER_KEY'], article_id) <= 0:
|
||
logger.info("跳过已采集内容")
|
||
continue
|
||
|
||
if item['es_urlcontent'].endswith('展开'):
|
||
logger.info("跳过未展开的内容")
|
||
continue
|
||
|
||
article_items.append(item)
|
||
|
||
except Exception as e:
|
||
logger.debug("解析单条帖子失败: %s" % repr(e))
|
||
continue
|
||
|
||
logger.info("用户 {} 的发文数量为 {}".format(uid, len(article_items)))
|
||
return article_items
|
||
|
||
def comment_parse(self, response):
|
||
browser = response.request.meta['driver']
|
||
article_id = response.request.meta['article_id']
|
||
article_author = response.request.meta['article_id']
|
||
article_id = response.request.meta['article_id']
|
||
# 加载所有评论
|
||
browser.execute_script(
|
||
'window.scrollTo(0, document.body.scrollHeight)')
|
||
see_next = False
|
||
try:
|
||
see_next = browser.find_element_by_xpath(
|
||
'//div[contains(@id,"see_next")]')
|
||
except:
|
||
pass
|
||
while see_next:
|
||
body_height1 = browser.execute_script(
|
||
'return document.body.scrollHeight')
|
||
see_next.click()
|
||
time.sleep(1)
|
||
browser.execute_script(
|
||
'window.scrollTo(0, document.body.scrollHeight)')
|
||
body_height2 = browser.execute_script(
|
||
'return document.body.scrollHeight')
|
||
if body_height1 == body_height2:
|
||
break
|
||
try:
|
||
see_next = browser.find_element_by_xpath(
|
||
'//div[contains(@id,"see_next")]')
|
||
except:
|
||
see_next = False
|
||
# 获取所有评论
|
||
comment_elements = browser.find_elements_by_xpath(
|
||
'//div[@data-sigil="comment"]')
|
||
for comment_e in comment_elements:
|
||
comment_user_id = comment_e.get_attribute('id')
|
||
comment_user_name = comment_e.find_element_by_xpath(
|
||
'./div[2]/div/div//a | .//div[@data-sigil="comment-body"]/a').text
|
||
comment_body = comment_e.find_element_by_xpath(
|
||
'.//div[@data-sigil="comment-body"]')
|
||
comment_id = comment_body.get_attribute(
|
||
'data-commentid').split('_')[1]
|
||
comment_content = comment_body.text
|
||
comment_item = MediaspidersItem()
|
||
comment_item['es_authors'] = comment_user_name
|
||
comment_item['es_sid'] = comment_id
|
||
comment_item['es_content'] = comment_content
|
||
comment_item['es_hkey'] = str(article_id)
|
||
comment_item['es_catalog2'] = article_author
|
||
comment_item['es_sitename'] = 'facebook'
|
||
comment_item['es_srcname'] = 'facebook_comment'
|
||
comment_item['es_carriertype'] = 'facebook_comment'
|
||
comment_user_url_a = comment_e.find_element_by_xpath(
|
||
'./div[1]/div/a')
|
||
if comment_user_url_a:
|
||
comment_user_url = comment_user_url_a.get_attribute('href')
|
||
self.redis_client.sadd('URL_Filter:MediaSpiders:Facebook_Comment_User_Filter', comment_user_url)
|
||
yield comment_item
|
||
# 请求下一篇发文的评论
|
||
if len(self.comment_urls) > 0:
|
||
comment_url = self.comment_urls.pop()
|
||
yield SeleniumRequest(url=comment_url['url'], callback=self.comment_parse,
|
||
meta={'article_id': comment_url['article_id']})
|