针对6s采集的改动

This commit is contained in:
yuxin-pc 2026-02-12 09:35:23 +08:00
parent d3a46db615
commit dfb65f11bf
2 changed files with 26 additions and 31 deletions

View File

@@ -110,8 +110,8 @@ CUSTOM_USER_AGENT = [
 # 部署在外网采集fb时使用selenium_chrome
 SELENIUM_DRIVER_NAME = 'chrome'
-SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
-# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
+# SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
+SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
 SELENIUM_DRIVER_ARGUMENTS = [
     '--headless',
     '--no-sandbox',

View File

@@ -1,24 +1,17 @@
 # -*- coding: utf-8 -*-
 import json
 import logging as logger
-import random
 import re
-import time
 from urllib import parse
 import redis
 import scrapy
 from scrapy_selenium import SeleniumRequest
-from selenium import webdriver
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
 from MediaSpiders.items import MediaspidersItem
 from MediaSpiders.utils.http_utils import http_post
 from MediaSpiders.utils.login_utils import login
 from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
-from selenium.webdriver.common.action_chains import ActionChains
 from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
@@ -42,8 +35,8 @@ class TwitterSpider(scrapy.Spider):
         'IMAGES_RESULT_FIELD': 'es_urlimage',
         'FILES_STORE': r'/usr/local/videos',
         'FILES_RESULT_FIELD': 'es_video',
-        'ZIP_FILE_NAME': 'image_data_publicinfo_',
-        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
+        'ZIP_FILE_NAME': 'image_data_ship_',  # 图片包名称
+        'FILE_ZIP_FILE_NAME': 'image_data_plane_',  # 视频包名称
         'ITEM_PIPELINES': {
             'scrapy.pipelines.images.ImagesPipeline': 2,
             'scrapy.pipelines.files.FilesPipeline': 1,
@@ -79,32 +72,34 @@ class TwitterSpider(scrapy.Spider):
         self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
                                         password=self.settings['REDIS_PWD'])
         self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
+        cookie_string = None
         # 获取采集登录账号并登录
         login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
-        # 尝试自动化登录网页获取 cookies若失败则从redis中 使用已有cookies
-        try:
-            driver = login().login_with_selenium(
-                'https://x.com/i/flow/login',
-                self.name,
-                login_users=login_users,
-                response=response
-            )
-            cookies = driver.get_cookies()
-            # 取cookie中的ct0为x-csrf-token取gt为x-guest-token
-            self.cookie_dict = {}
-            for cookie in cookies:
-                self.cookie_dict[cookie['name']] = cookie['value']
-        except Exception as e:
-            logger.info("自动化获取cookies失败")
-            cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
+        # 从redis中 使用已有cookies否则自动化登录网页获取cookies
+        cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
+        ct0 = None
+        if cookie_string:
             self.cookie_dict = form_cookie_dict(cookie_string)
-        # 5. 构建 headers
-        ct0 = self.cookie_dict.get('ct0')
-        if not ct0:
-            logger.error("redis中cookie缺失ct0 (CSRF token)")
-            return
+            # 5. 构建 headers
+            ct0 = self.cookie_dict.get('ct0')
+            if not ct0:
+                logger.error("redis中cookie缺失ct0 (CSRF token)")
+                return
+        else:
+            try:
+                driver = login().login_with_selenium(
+                    'https://x.com/i/flow/login',
+                    self.name,
+                    login_users=login_users,
+                    response=response
+                )
+                cookies = driver.get_cookies()
+                # 取cookie中的ct0为x-csrf-token取gt为x-guest-token
+                self.cookie_dict = {}
+                for cookie in cookies:
+                    self.cookie_dict[cookie['name']] = cookie['value']
+            except Exception as e:
+                logger.info("自动化获取cookies失败")
         self.header = {
             'Host': 'api.twitter.com',
@@ -137,7 +132,7 @@ class TwitterSpider(scrapy.Spider):
                 yield scrapy.Request(url=graphql_url, callback=self.parse,
                                      meta={
                                          'uid': user_info['userUid'],
-                                         'proxy': 'http://127.0.0.1:10809',
+                                         # 'proxy': 'http://127.0.0.1:10808',
                                          'currentCount': 0
                                      },
                                      cookies=self.cookie_dict, headers=self.header)