针对6s采集的改动
This commit is contained in:
parent
d3a46db615
commit
dfb65f11bf
@@ -110,8 +110,8 @@ CUSTOM_USER_AGENT = [
|
|||||||
|
|
||||||
# 部署在外网采集fb时使用selenium_chrome
|
# 部署在外网采集fb时使用selenium_chrome
|
||||||
SELENIUM_DRIVER_NAME = 'chrome'
|
SELENIUM_DRIVER_NAME = 'chrome'
|
||||||
SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
|
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
|
||||||
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
|
SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
|
||||||
SELENIUM_DRIVER_ARGUMENTS = [
|
SELENIUM_DRIVER_ARGUMENTS = [
|
||||||
'--headless',
|
'--headless',
|
||||||
'--no-sandbox',
|
'--no-sandbox',
|
||||||
|
|||||||
@@ -1,24 +1,17 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import json
|
import json
|
||||||
import logging as logger
|
import logging as logger
|
||||||
import random
|
|
||||||
import re
|
import re
|
||||||
import time
|
|
||||||
from urllib import parse
|
from urllib import parse
|
||||||
|
|
||||||
import redis
|
import redis
|
||||||
import scrapy
|
import scrapy
|
||||||
from scrapy_selenium import SeleniumRequest
|
from scrapy_selenium import SeleniumRequest
|
||||||
from selenium import webdriver
|
|
||||||
from selenium.webdriver.support.ui import WebDriverWait
|
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
|
||||||
from selenium.webdriver.common.by import By
|
|
||||||
from MediaSpiders.items import MediaspidersItem
|
from MediaSpiders.items import MediaspidersItem
|
||||||
from MediaSpiders.utils.http_utils import http_post
|
from MediaSpiders.utils.http_utils import http_post
|
||||||
from MediaSpiders.utils.login_utils import login
|
from MediaSpiders.utils.login_utils import login
|
||||||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||||||
from selenium.webdriver.common.action_chains import ActionChains
|
|
||||||
|
|
||||||
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
|
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
|
||||||
|
|
||||||
|
|
||||||
@@ -42,8 +35,8 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
'IMAGES_RESULT_FIELD': 'es_urlimage',
|
'IMAGES_RESULT_FIELD': 'es_urlimage',
|
||||||
'FILES_STORE': r'/usr/local/videos',
|
'FILES_STORE': r'/usr/local/videos',
|
||||||
'FILES_RESULT_FIELD': 'es_video',
|
'FILES_RESULT_FIELD': 'es_video',
|
||||||
'ZIP_FILE_NAME': 'image_data_publicinfo_',
|
'ZIP_FILE_NAME': 'image_data_ship_', # 图片包名称
|
||||||
'FILE_ZIP_FILE_NAME': 'image_data_plane_',
|
'FILE_ZIP_FILE_NAME': 'image_data_plane_', # 视频包名称
|
||||||
'ITEM_PIPELINES': {
|
'ITEM_PIPELINES': {
|
||||||
'scrapy.pipelines.images.ImagesPipeline': 2,
|
'scrapy.pipelines.images.ImagesPipeline': 2,
|
||||||
'scrapy.pipelines.files.FilesPipeline': 1,
|
'scrapy.pipelines.files.FilesPipeline': 1,
|
||||||
@@ -79,10 +72,19 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
|
self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
|
||||||
password=self.settings['REDIS_PWD'])
|
password=self.settings['REDIS_PWD'])
|
||||||
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
|
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
|
||||||
cookie_string = None
|
|
||||||
# 获取采集登录账号并登录
|
# 获取采集登录账号并登录
|
||||||
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
|
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
|
||||||
# 尝试自动化登录网页获取 cookies,若失败则从redis中 使用已有cookies
|
# 从redis中 使用已有cookies,否则自动化登录网页获取cookies
|
||||||
|
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
|
||||||
|
ct0 = None
|
||||||
|
if cookie_string:
|
||||||
|
self.cookie_dict = form_cookie_dict(cookie_string)
|
||||||
|
# 5. 构建 headers
|
||||||
|
ct0 = self.cookie_dict.get('ct0')
|
||||||
|
if not ct0:
|
||||||
|
logger.error("redis中cookie缺失ct0 (CSRF token)!")
|
||||||
|
return
|
||||||
|
else:
|
||||||
try:
|
try:
|
||||||
|
|
||||||
driver = login().login_with_selenium(
|
driver = login().login_with_selenium(
|
||||||
@@ -98,13 +100,6 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
self.cookie_dict[cookie['name']] = cookie['value']
|
self.cookie_dict[cookie['name']] = cookie['value']
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.info("自动化获取cookies失败")
|
logger.info("自动化获取cookies失败")
|
||||||
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
|
|
||||||
self.cookie_dict = form_cookie_dict(cookie_string)
|
|
||||||
# 5. 构建 headers
|
|
||||||
ct0 = self.cookie_dict.get('ct0')
|
|
||||||
if not ct0:
|
|
||||||
logger.error("redis中cookie缺失ct0 (CSRF token)!")
|
|
||||||
return
|
|
||||||
|
|
||||||
self.header = {
|
self.header = {
|
||||||
'Host': 'api.twitter.com',
|
'Host': 'api.twitter.com',
|
||||||
@@ -137,7 +132,7 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
yield scrapy.Request(url=graphql_url, callback=self.parse,
|
yield scrapy.Request(url=graphql_url, callback=self.parse,
|
||||||
meta={
|
meta={
|
||||||
'uid': user_info['userUid'],
|
'uid': user_info['userUid'],
|
||||||
'proxy': 'http://127.0.0.1:10809',
|
# 'proxy': 'http://127.0.0.1:10808',
|
||||||
'currentCount': 0
|
'currentCount': 0
|
||||||
},
|
},
|
||||||
cookies=self.cookie_dict, headers=self.header)
|
cookies=self.cookie_dict, headers=self.header)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user