针对6s采集的改动
This commit is contained in:
parent
d3a46db615
commit
dfb65f11bf
@ -110,8 +110,8 @@ CUSTOM_USER_AGENT = [
|
||||
|
||||
# 部署在外网采集fb时使用selenium_chrome
|
||||
SELENIUM_DRIVER_NAME = 'chrome'
|
||||
SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
|
||||
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
|
||||
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
|
||||
SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
|
||||
SELENIUM_DRIVER_ARGUMENTS = [
|
||||
'--headless',
|
||||
'--no-sandbox',
|
||||
|
||||
@ -1,24 +1,17 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import logging as logger
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from urllib import parse
|
||||
|
||||
import redis
|
||||
import scrapy
|
||||
from scrapy_selenium import SeleniumRequest
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
from MediaSpiders.items import MediaspidersItem
|
||||
from MediaSpiders.utils.http_utils import http_post
|
||||
from MediaSpiders.utils.login_utils import login
|
||||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
|
||||
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
|
||||
|
||||
|
||||
@ -42,8 +35,8 @@ class TwitterSpider(scrapy.Spider):
|
||||
'IMAGES_RESULT_FIELD': 'es_urlimage',
|
||||
'FILES_STORE': r'/usr/local/videos',
|
||||
'FILES_RESULT_FIELD': 'es_video',
|
||||
'ZIP_FILE_NAME': 'image_data_publicinfo_',
|
||||
'FILE_ZIP_FILE_NAME': 'image_data_plane_',
|
||||
'ZIP_FILE_NAME': 'image_data_ship_', # 图片包名称
|
||||
'FILE_ZIP_FILE_NAME': 'image_data_plane_', # 视频包名称
|
||||
'ITEM_PIPELINES': {
|
||||
'scrapy.pipelines.images.ImagesPipeline': 2,
|
||||
'scrapy.pipelines.files.FilesPipeline': 1,
|
||||
@ -79,32 +72,34 @@ class TwitterSpider(scrapy.Spider):
|
||||
self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
|
||||
password=self.settings['REDIS_PWD'])
|
||||
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
|
||||
cookie_string = None
|
||||
# 获取采集登录账号并登录
|
||||
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
|
||||
# 先尝试自动化登录网页获取 cookies;若失败,则使用 redis 中已有的 cookies
|
||||
try:
|
||||
|
||||
driver = login().login_with_selenium(
|
||||
'https://x.com/i/flow/login',
|
||||
self.name,
|
||||
login_users=login_users,
|
||||
response=response
|
||||
)
|
||||
cookies = driver.get_cookies()
|
||||
# 取cookie中的ct0为x-csrf-token,取gt为x-guest-token
|
||||
self.cookie_dict = {}
|
||||
for cookie in cookies:
|
||||
self.cookie_dict[cookie['name']] = cookie['value']
|
||||
except Exception as e:
|
||||
logger.info("自动化获取cookies失败")
|
||||
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
|
||||
# 优先使用 redis 中已有的 cookies;若 redis 中没有,则通过自动化登录网页获取 cookies
|
||||
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
|
||||
ct0 = None
|
||||
if cookie_string:
|
||||
self.cookie_dict = form_cookie_dict(cookie_string)
|
||||
# 构建 headers 前,先从 cookie 中取出并校验 ct0 (CSRF token)
|
||||
ct0 = self.cookie_dict.get('ct0')
|
||||
if not ct0:
|
||||
logger.error("redis中cookie缺失ct0 (CSRF token)!")
|
||||
return
|
||||
else:
|
||||
try:
|
||||
|
||||
driver = login().login_with_selenium(
|
||||
'https://x.com/i/flow/login',
|
||||
self.name,
|
||||
login_users=login_users,
|
||||
response=response
|
||||
)
|
||||
cookies = driver.get_cookies()
|
||||
# 取cookie中的ct0为x-csrf-token,取gt为x-guest-token
|
||||
self.cookie_dict = {}
|
||||
for cookie in cookies:
|
||||
self.cookie_dict[cookie['name']] = cookie['value']
|
||||
except Exception as e:
|
||||
logger.info("自动化获取cookies失败")
|
||||
|
||||
self.header = {
|
||||
'Host': 'api.twitter.com',
|
||||
@ -137,7 +132,7 @@ class TwitterSpider(scrapy.Spider):
|
||||
yield scrapy.Request(url=graphql_url, callback=self.parse,
|
||||
meta={
|
||||
'uid': user_info['userUid'],
|
||||
'proxy': 'http://127.0.0.1:10809',
|
||||
# 'proxy': 'http://127.0.0.1:10808',
|
||||
'currentCount': 0
|
||||
},
|
||||
cookies=self.cookie_dict, headers=self.header)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user