From dfb65f11bf81ca36b89143ce08fae48c02830547 Mon Sep 17 00:00:00 2001 From: yuxin-pc Date: Thu, 12 Feb 2026 09:35:23 +0800 Subject: [PATCH] =?UTF-8?q?=E9=92=88=E5=AF=B96s=E9=87=87=E9=9B=86=E7=9A=84?= =?UTF-8?q?=E6=94=B9=E5=8A=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spiders/MediaSpiders/MediaSpiders/settings.py | 4 +- .../MediaSpiders/spiders/TwitterUserSpider.py | 53 +++++++++---------- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/spiders/MediaSpiders/MediaSpiders/settings.py b/spiders/MediaSpiders/MediaSpiders/settings.py index fa726d5..7215e25 100644 --- a/spiders/MediaSpiders/MediaSpiders/settings.py +++ b/spiders/MediaSpiders/MediaSpiders/settings.py @@ -110,8 +110,8 @@ CUSTOM_USER_AGENT = [ # 部署在外网采集fb时使用selenium_chrome SELENIUM_DRIVER_NAME = 'chrome' -SELENIUM_DRIVER_EXECUTABLE_PATH = 'local' -# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098' +# SELENIUM_DRIVER_EXECUTABLE_PATH = 'local' +SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098' SELENIUM_DRIVER_ARGUMENTS = [ '--headless', '--no-sandbox', diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py index 4b1102f..a60fe86 100644 --- a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py +++ b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py @@ -1,24 +1,17 @@ # -*- coding: utf-8 -*- import json import logging as logger -import random import re -import time from urllib import parse import redis import scrapy from scrapy_selenium import SeleniumRequest -from selenium import webdriver -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.by import By + from MediaSpiders.items import MediaspidersItem from MediaSpiders.utils.http_utils import http_post from MediaSpiders.utils.login_utils import login from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp -from selenium.webdriver.common.action_chains import ActionChains - from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation @@ -42,8 +35,8 @@ class TwitterSpider(scrapy.Spider): 'IMAGES_RESULT_FIELD': 'es_urlimage', 'FILES_STORE': r'/usr/local/videos', 'FILES_RESULT_FIELD': 'es_video', - 'ZIP_FILE_NAME': 'image_data_publicinfo_', - 'FILE_ZIP_FILE_NAME': 'image_data_plane_', + 'ZIP_FILE_NAME': 'image_data_ship_', # 图片包名称 + 'FILE_ZIP_FILE_NAME': 'image_data_plane_', # 视频包名称 'ITEM_PIPELINES': { 'scrapy.pipelines.images.ImagesPipeline': 2, 'scrapy.pipelines.files.FilesPipeline': 1, @@ -79,32 +72,34 @@ class TwitterSpider(scrapy.Spider): self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'], password=self.settings['REDIS_PWD']) self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY'] - cookie_string = None # 获取采集登录账号并登录 login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts') - # 尝试自动化登录网页获取 cookies,若失败则从redis中 使用已有cookies - try: - - driver = login().login_with_selenium( - 'https://x.com/i/flow/login', - self.name, - login_users=login_users, - response=response - ) - cookies = driver.get_cookies() - # 取cookie中的ct0为x-csrf-token,取gt为x-guest-token - self.cookie_dict = {} - for cookie in cookies: - self.cookie_dict[cookie['name']] = cookie['value'] - except Exception as e: - logger.info("自动化获取cookies失败") - cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode() + # 从redis中 使用已有cookies,否则自动化登录网页获取cookies + cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode() + ct0 = None + if cookie_string: self.cookie_dict = form_cookie_dict(cookie_string) # 5. 构建 headers ct0 = self.cookie_dict.get('ct0') if not ct0: logger.error("redis中cookie缺失ct0 (CSRF token)!") return + else: + try: + + driver = login().login_with_selenium( + 'https://x.com/i/flow/login', + self.name, + login_users=login_users, + response=response + ) + cookies = driver.get_cookies() + # 取cookie中的ct0为x-csrf-token,取gt为x-guest-token + self.cookie_dict = {} + for cookie in cookies: + self.cookie_dict[cookie['name']] = cookie['value'] + except Exception as e: + logger.info("自动化获取cookies失败") self.header = { 'Host': 'api.twitter.com', @@ -137,7 +132,7 @@ class TwitterSpider(scrapy.Spider): yield scrapy.Request(url=graphql_url, callback=self.parse, meta={ 'uid': user_info['userUid'], - 'proxy': 'http://127.0.0.1:10809', + # 'proxy': 'http://127.0.0.1:10808', 'currentCount': 0 }, cookies=self.cookie_dict, headers=self.header)