针对6s采集的改动

This commit is contained in:
yuxin-pc 2026-02-12 09:35:23 +08:00
parent d3a46db615
commit dfb65f11bf
2 changed files with 26 additions and 31 deletions

View File

@ -110,8 +110,8 @@ CUSTOM_USER_AGENT = [
# 部署在外网采集fb时使用selenium_chrome
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
SELENIUM_DRIVER_ARGUMENTS = [
'--headless',
'--no-sandbox',

View File

@ -1,24 +1,17 @@
# -*- coding: utf-8 -*-
import json
import logging as logger
import random
import re
import time
from urllib import parse
import redis
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.http_utils import http_post
from MediaSpiders.utils.login_utils import login
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
from selenium.webdriver.common.action_chains import ActionChains
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
@ -42,8 +35,8 @@ class TwitterSpider(scrapy.Spider):
'IMAGES_RESULT_FIELD': 'es_urlimage',
'FILES_STORE': r'/usr/local/videos',
'FILES_RESULT_FIELD': 'es_video',
'ZIP_FILE_NAME': 'image_data_publicinfo_',
'FILE_ZIP_FILE_NAME': 'image_data_plane_',
'ZIP_FILE_NAME': 'image_data_ship_', # 图片包名称
'FILE_ZIP_FILE_NAME': 'image_data_plane_', # 视频包名称
'ITEM_PIPELINES': {
'scrapy.pipelines.images.ImagesPipeline': 2,
'scrapy.pipelines.files.FilesPipeline': 1,
@ -79,32 +72,34 @@ class TwitterSpider(scrapy.Spider):
self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
password=self.settings['REDIS_PWD'])
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
cookie_string = None
# 获取采集登录账号并登录
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
# 尝试自动化登录网页获取 cookies若失败则从redis中 使用已有cookies
try:
driver = login().login_with_selenium(
'https://x.com/i/flow/login',
self.name,
login_users=login_users,
response=response
)
cookies = driver.get_cookies()
# 取cookie中的ct0为x-csrf-token取gt为x-guest-token
self.cookie_dict = {}
for cookie in cookies:
self.cookie_dict[cookie['name']] = cookie['value']
except Exception as e:
logger.info("自动化获取cookies失败")
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
# 从redis中 使用已有cookies否则自动化登录网页获取cookies
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
ct0 = None
if cookie_string:
self.cookie_dict = form_cookie_dict(cookie_string)
# 5. 构建 headers
ct0 = self.cookie_dict.get('ct0')
if not ct0:
logger.error("redis中cookie缺失ct0 (CSRF token)")
return
else:
try:
driver = login().login_with_selenium(
'https://x.com/i/flow/login',
self.name,
login_users=login_users,
response=response
)
cookies = driver.get_cookies()
# 取cookie中的ct0为x-csrf-token取gt为x-guest-token
self.cookie_dict = {}
for cookie in cookies:
self.cookie_dict[cookie['name']] = cookie['value']
except Exception as e:
logger.info("自动化获取cookies失败")
self.header = {
'Host': 'api.twitter.com',
@ -137,7 +132,7 @@ class TwitterSpider(scrapy.Spider):
yield scrapy.Request(url=graphql_url, callback=self.parse,
meta={
'uid': user_info['userUid'],
'proxy': 'http://127.0.0.1:10809',
# 'proxy': 'http://127.0.0.1:10808',
'currentCount': 0
},
cookies=self.cookie_dict, headers=self.header)