Compare commits
2 Commits
91d3f484f0
...
488bc2fdca
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
488bc2fdca | ||
|
|
a69ff25ce4 |
@ -1,4 +1,4 @@
|
|||||||
# -*- coding: utf-8 -*-
|
l# -*- coding: utf-8 -*-
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import platform
|
import platform
|
||||||
|
|||||||
@ -59,14 +59,23 @@ class SeleniumMiddleware:
|
|||||||
# Edge in headless mode
|
# Edge in headless mode
|
||||||
edge_options = EdgeOptions()
|
edge_options = EdgeOptions()
|
||||||
edge_options.use_chromium = True
|
edge_options.use_chromium = True
|
||||||
self.driver = Edge(executable_path='msedgedriver.exe', options=edge_options)
|
# edge_options.add_argument("--headless")
|
||||||
self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
# 隐藏“受自动化软件控制”提示栏
|
||||||
"source": """
|
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
||||||
Object.defineProperty(navigator, 'webdriver', {
|
# 禁用自动化扩展
|
||||||
get: () => undefined
|
edge_options.add_experimental_option('useAutomationExtension', False)
|
||||||
})
|
edge_options.add_argument(
|
||||||
"""
|
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0")
|
||||||
})
|
edge_options.add_argument("--window-size=1920,1080")
|
||||||
|
# 设置浏览器的 高级偏好设置
|
||||||
|
prefs = {
|
||||||
|
# "profile.managed_default_content_settings.images": 2, # 禁用图片加载:2 表示“禁止”,1 表示“允许”
|
||||||
|
"credentials_enable_service": False, # 禁用保存密码提示
|
||||||
|
"profile.password_manager_enabled": False # 禁用密码管理器
|
||||||
|
}
|
||||||
|
edge_options.add_experimental_option("prefs", prefs)
|
||||||
|
|
||||||
|
self.driver = Edge(executable_path="C:/Users/DELL/Downloads/edgedriver_win64/msedgedriver.exe", options=edge_options)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_crawler(cls, crawler):
|
def from_crawler(cls, crawler):
|
||||||
@ -100,7 +109,7 @@ class SeleniumMiddleware:
|
|||||||
self.proxy_count = 0
|
self.proxy_count = 0
|
||||||
ip = request.meta['proxy'].split(':')[1][2:]
|
ip = request.meta['proxy'].split(':')[1][2:]
|
||||||
port = int(request.meta['proxy'].split(':')[2])
|
port = int(request.meta['proxy'].split(':')[2])
|
||||||
user_agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
|
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0'
|
||||||
self.driver.get("about:config")
|
self.driver.get("about:config")
|
||||||
script = '''
|
script = '''
|
||||||
var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch);
|
var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch);
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
BOT_NAME = 'MediaSpiders'
|
BOT_NAME = 'MediaSpiders'
|
||||||
|
|
||||||
LOG_LEVEL = 'INFO'
|
LOG_LEVEL = 'DEBUG'
|
||||||
|
|
||||||
SPIDER_MODULES = ['MediaSpiders.spiders']
|
SPIDER_MODULES = ['MediaSpiders.spiders']
|
||||||
NEWSPIDER_MODULE = 'MediaSpiders.spiders'
|
NEWSPIDER_MODULE = 'MediaSpiders.spiders'
|
||||||
@ -58,10 +58,16 @@ FLICKR_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Flickr_Filter'
|
|||||||
|
|
||||||
WECHAT_LINKS_KEY = "MediaSpiders:Wechat_links"
|
WECHAT_LINKS_KEY = "MediaSpiders:Wechat_links"
|
||||||
|
|
||||||
|
# TWITTER_BEARER_TOKEN = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" # old
|
||||||
|
TWITTER_API_KEY = "JFY7dt"
|
||||||
|
TWITTER_BEARER_TOKEN = "AAAAAAAAAAAAAAAAAAAAAO8MTQEAAAAAQWidbP34N0nykDnUEDweEpyRgsc%3Dxt0hX1whV1hlmbMsStkB7ZU3pjXOINOCh2DMPoIAwljwrOWgvE"
|
||||||
|
TWITTER_ACCESS_TOKEN = "1294829483816398849-gscLJCEF9ZObZJikjCmjXtxoW6YVWu"
|
||||||
|
TWITTER_ACCESS_TOKEN_SECRET = "1XvTHZXzN0JBQulTBOvCTgXVPzVGYWe50zH1r4qXLper3"
|
||||||
|
|
||||||
SOCIAL_USER_QUERY_ALL_API = "http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy={sortBy}&shuffleResult={shuffleResult}"
|
SOCIAL_USER_QUERY_ALL_API = "http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy={sortBy}&shuffleResult={shuffleResult}"
|
||||||
SOCIAL_USER_UPDATE_API = "http://47.115.228.133:28081/api/open/target/social/update"
|
SOCIAL_USER_UPDATE_API = "http://47.115.228.133:28081/api/open/target/social/update"
|
||||||
WEIBO_USER_TYPE = 0
|
WEIBO_USER_TYPE = 0
|
||||||
TWITTER_USER_TYPE = 1
|
TWITTER_USER_TYPE = 21
|
||||||
FACEBOOK_USER_TYPE = 2
|
FACEBOOK_USER_TYPE = 2
|
||||||
YOUTUBE_USER_TYPE = 3
|
YOUTUBE_USER_TYPE = 3
|
||||||
FLICKR_USER_TYPE = 4
|
FLICKR_USER_TYPE = 4
|
||||||
@ -73,8 +79,25 @@ TWITTER_URL_KEY = 'MediaSpiders:Twitter_URL_Key'
|
|||||||
TWITTER_PID_KEY = ''
|
TWITTER_PID_KEY = ''
|
||||||
KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
|
KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
|
||||||
|
|
||||||
|
# CUSTOM_USER_AGENT = [
|
||||||
|
# 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
|
||||||
|
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
|
||||||
|
# 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
||||||
|
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
|
||||||
|
# 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
|
||||||
|
# 'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
|
||||||
|
# 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0',
|
||||||
|
# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E',
|
||||||
|
# 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
|
||||||
|
# ]
|
||||||
|
|
||||||
CUSTOM_USER_AGENT = [
|
CUSTOM_USER_AGENT = [
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0',
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36',
|
||||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0',
|
||||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0',
|
||||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.1958',
|
||||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 OPR/117.0.0.',
|
||||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.3'
|
||||||
]
|
]
|
||||||
|
|
||||||
# 部署在外网采集fb时使用selenium_chrome
|
# 部署在外网采集fb时使用selenium_chrome
|
||||||
@ -86,8 +109,7 @@ SELENIUM_DRIVER_ARGUMENTS = [
|
|||||||
'--no-sandbox',
|
'--no-sandbox',
|
||||||
'--disable-dev-shm-usage',
|
'--disable-dev-shm-usage',
|
||||||
'--disable-gpu',
|
'--disable-gpu',
|
||||||
'--window-size=1920,1080',
|
'--window-size=1920,1080'
|
||||||
'--disable-blink-features=AutomationControlled'
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# 本地调试用
|
# 本地调试用
|
||||||
|
|||||||
@ -9,10 +9,23 @@ from urllib import parse
|
|||||||
import redis
|
import redis
|
||||||
import scrapy
|
import scrapy
|
||||||
from scrapy_selenium import SeleniumRequest
|
from scrapy_selenium import SeleniumRequest
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
from MediaSpiders.items import MediaspidersItem
|
from MediaSpiders.items import MediaspidersItem
|
||||||
from MediaSpiders.utils.http_utils import http_post
|
from MediaSpiders.utils.http_utils import http_post
|
||||||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||||||
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
|
||||||
|
|
||||||
|
def form_cookie_dict(cookie_string):
    """Parse a ``Cookie``-header style string into a ``{name: value}`` dict.

    The input is split on ``;`` into ``name=value`` segments. All spaces are
    removed from each name; the value is kept exactly as written.

    Args:
        cookie_string: Text such as ``"ct0=abc; auth_token=xyz"``.

    Returns:
        dict mapping cookie names to their values. Later duplicates of a name
        overwrite earlier ones.
    """
    cookie_dict = {}
    for segment in cookie_string.split(';'):
        # Split on the FIRST '=' only: cookie values (e.g. base64 with '='
        # padding) may legitimately contain '=' and must not be truncated.
        name, separator, value = segment.partition('=')
        if not separator:
            # Empty input or a trailing ';' yields a segment with no '=';
            # skip it instead of raising IndexError.
            continue
        cookie_dict[name.replace(' ', '')] = value
    return cookie_dict
|
||||||
|
|
||||||
|
|
||||||
class TwitterSpider(scrapy.Spider):
|
class TwitterSpider(scrapy.Spider):
|
||||||
@ -38,6 +51,7 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
# 'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
|
# 'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
|
||||||
# 'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
|
# 'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
|
||||||
# 'scrapy_splash.SplashDeduplicateArgsMiddleware': 700,
|
# 'scrapy_splash.SplashDeduplicateArgsMiddleware': 700,
|
||||||
|
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -63,58 +77,125 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
password=self.settings['REDIS_PWD'])
|
password=self.settings['REDIS_PWD'])
|
||||||
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
|
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
|
||||||
logger.info("login twitter")
|
logger.info("login twitter")
|
||||||
driver = response.request.meta['driver']
|
cookie_string = None
|
||||||
driver.maximize_window()
|
# 尝试自动化登录网页获取 cookies,若失败则从redis中 使用已有cookies
|
||||||
driver.get('https://x.com/i/flow/login')
|
|
||||||
time.sleep(5)
|
|
||||||
# 获取采集登录账号并登录
|
|
||||||
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
|
|
||||||
user_list = []
|
|
||||||
for u in login_users:
|
|
||||||
user_list.append(json.loads(u.decode()))
|
|
||||||
login_user = random.choice(user_list)
|
|
||||||
logger.info(f"login as user {login_user['uid']}")
|
|
||||||
driver.find_element_by_xpath("//input").send_keys(login_user['uid'])
|
|
||||||
try:
|
try:
|
||||||
next_button = driver.find_element_by_xpath("//div[@role='button'][2]")
|
driver = response.request.meta['driver']
|
||||||
next_button.click()
|
# 隐藏指纹
|
||||||
except Exception:
|
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||||
logger.info("点击“下一步”的button元素")
|
'source': '''
|
||||||
next_button = driver.find_element_by_xpath("//button[@role='button'][2]")
|
delete navigator.__proto__.webdriver;
|
||||||
next_button.click()
|
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||||
time.sleep(5)
|
'''
|
||||||
try:
|
})
|
||||||
logger.info("输入手机号验证...")
|
|
||||||
driver.find_element_by_xpath("//input[@name='text']").send_keys("+8619962025641")
|
driver.maximize_window()
|
||||||
driver.find_element_by_xpath("//button[@data-testid='ocfEnterTextNextButton']").click()
|
# 1. 打开第一个标签页
|
||||||
|
driver.get('https://x.com/i/flow/login')
|
||||||
|
wait = WebDriverWait(driver, 15)
|
||||||
|
# 2. 通过 JS 打开第二个标签页(新 Tab)
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
except Exception:
|
driver.execute_script("window.open('');")
|
||||||
logger.info("无需输入手机号验证")
|
driver.execute_script("window.open('https://x.com/i/flow/login', '_blank');")
|
||||||
driver.find_element_by_xpath("//input[@name='password']").send_keys(login_user['pwd'])
|
|
||||||
driver.find_element_by_xpath("//button[@data-testid='LoginForm_Login_Button']").click()
|
# 3. 获取所有标签页句柄
|
||||||
time.sleep(5)
|
handles = driver.window_handles # [handle1, handle2]
|
||||||
try:
|
|
||||||
driver.find_element_by_xpath("//button[@data-testid='confirmationSheetConfirm']").click()
|
# 4. 切换到第二个标签页(可选)
|
||||||
time.sleep(10)
|
driver.switch_to.window(handles[1])
|
||||||
except:
|
# 获取采集登录账号并登录
|
||||||
time.sleep(5)
|
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
|
||||||
cookies = driver.get_cookies()
|
user_list = []
|
||||||
# cookies = json.loads(response.text)['cookies']
|
for u in login_users:
|
||||||
# 取cookie中的ct0为x-csrf-token,取gt为x-guest-token
|
user_list.append(json.loads(u.decode()))
|
||||||
self.cookie_dict = {}
|
|
||||||
for cookie in cookies:
|
login_user = random.choice(user_list)
|
||||||
self.cookie_dict[cookie['name']] = cookie['value']
|
logger.info(f"login as user {login_user['uid']}")
|
||||||
|
# time.sleep(random.uniform(1.5, 3.0))
|
||||||
|
# driver.find_element_by_xpath("//input").send_keys(login_user['uid'])
|
||||||
|
# 等待并定位用户名输入框
|
||||||
|
username_input = wait.until(
|
||||||
|
EC.element_to_be_clickable((By.CSS_SELECTOR, 'input[autocomplete="username"]'))
|
||||||
|
)
|
||||||
|
|
||||||
|
# 模拟真人逐字输入(带随机延迟)
|
||||||
|
username = login_user['uid']
|
||||||
|
for char in username:
|
||||||
|
username_input.send_keys(char)
|
||||||
|
time.sleep(random.uniform(0.05, 0.2)) # 每个字符间隔 50~200ms
|
||||||
|
|
||||||
|
time.sleep(random.uniform(0.3, 0.8)) # 输入完后稍作停顿
|
||||||
|
|
||||||
|
# 尝试点击 "Next" 按钮(主逻辑:带文本的按钮)
|
||||||
|
try:
|
||||||
|
next_button = wait.until(
|
||||||
|
EC.element_to_be_clickable(
|
||||||
|
(By.XPATH, "//button[.//span[contains(text(), 'Next') or contains(text(), '下一步')]]")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
body = driver.find_element(By.TAG_NAME, "body")
|
||||||
|
ActionChains(driver).move_to_element_with_offset(body, 100, 100).perform()
|
||||||
|
time.sleep(0.5)
|
||||||
|
# 模拟鼠标移动到按钮并点击
|
||||||
|
actions = ActionChains(driver)
|
||||||
|
actions.move_to_element(next_button).pause(random.uniform(0.2, 0.6)).click().perform()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.info("主 Next 按钮未找到,尝试备用定位方式")
|
||||||
|
try:
|
||||||
|
# 备用:通过 role 定位第二个 button
|
||||||
|
next_button = driver.find_element(By.XPATH, "//button[@role='button'][2]")
|
||||||
|
actions = ActionChains(driver)
|
||||||
|
actions.move_to_element(next_button).pause(random.uniform(0.2, 0.6)).click().perform()
|
||||||
|
except Exception as e2:
|
||||||
|
logger.error(f"两种方式均无法点击 Next 按钮: {e}, {e2}")
|
||||||
|
raise
|
||||||
|
time.sleep(random.uniform(1.5, 5.0)) # 等待页面加载
|
||||||
|
try:
|
||||||
|
logger.info("输入手机号验证...")
|
||||||
|
driver.find_element_by_xpath("//input[@name='text']").send_keys("+8619962025641")
|
||||||
|
# driver.find_element_by_xpath("//button[@data-testid='ocfEnterTextNextButton']").click()
|
||||||
|
driver.find_element_by_xpath(driver.find_element_by_xpath("//button[.//span[text()='下一步']]")).click()
|
||||||
|
time.sleep(random.uniform(1.5, 5.0)) # 等待页面加载
|
||||||
|
except Exception:
|
||||||
|
logger.info("无需输入手机号验证")
|
||||||
|
driver.find_element_by_xpath("//input[@name='password']").send_keys(login_user['pwd'])
|
||||||
|
driver.find_element_by_xpath("//button[@data-testid='LoginForm_Login_Button']").click()
|
||||||
|
time.sleep(random.uniform(1.5, 5.0)) # 等待页面加载
|
||||||
|
try:
|
||||||
|
driver.find_element_by_xpath("//button[@data-testid='confirmationSheetConfirm']").click()
|
||||||
|
time.sleep(random.uniform(1.5, 10.0)) # 等待页面加载
|
||||||
|
except:
|
||||||
|
time.sleep(5)
|
||||||
|
cookies = driver.get_cookies()
|
||||||
|
# cookies = json.loads(response.text)['cookies']
|
||||||
|
# 取cookie中的ct0为x-csrf-token,取gt为x-guest-token
|
||||||
|
self.cookie_dict = {}
|
||||||
|
for cookie in cookies:
|
||||||
|
self.cookie_dict[cookie['name']] = cookie['value']
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.info("自动化获取cookies失败")
|
||||||
|
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
|
||||||
|
self.cookie_dict = form_cookie_dict(cookie_string)
|
||||||
|
# 5. 构建 headers
|
||||||
|
ct0 = self.cookie_dict.get('ct0')
|
||||||
|
if not ct0:
|
||||||
|
logger.error("redis中cookie缺失ct0 (CSRF token)!")
|
||||||
|
return
|
||||||
|
|
||||||
self.header = {
|
self.header = {
|
||||||
'Host': 'api.twitter.com',
|
'Host': 'api.twitter.com',
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0',
|
||||||
'Accept': '*/*',
|
'Accept': '*/*',
|
||||||
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
|
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
|
||||||
'content-type': 'application/json',
|
'content-type': 'application/json',
|
||||||
'authorization': self.authorization,
|
'authorization': self.authorization,
|
||||||
# 'x-twitter-active-user': 'yes',
|
# 'x-twitter-active-user': 'yes',
|
||||||
'Origin': 'https://twitter.com',
|
'Origin': 'https://twitter.com',
|
||||||
'Connection': 'keep-alive',
|
'Cookie': cookie_string,
|
||||||
'X-Csrf-Token': self.cookie_dict['ct0']
|
# 'Connection': 'keep-alive',
|
||||||
|
'X-Csrf-Token': ct0
|
||||||
}
|
}
|
||||||
self.filter_key = self.settings['TWITTER_FILTER_KEY']
|
self.filter_key = self.settings['TWITTER_FILTER_KEY']
|
||||||
self.pid_key = self.settings['TWITTER_PID_KEY']
|
self.pid_key = self.settings['TWITTER_PID_KEY']
|
||||||
@ -131,9 +212,14 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
if account_rsp['code'] == 200:
|
if account_rsp['code'] == 200:
|
||||||
all_user_info = account_rsp['content']
|
all_user_info = account_rsp['content']
|
||||||
for user_info in all_user_info:
|
for user_info in all_user_info:
|
||||||
graphql_url = f'https://twitter.com/i/api/graphql/{url_key}/UserTweets?variables=%7B%22userId%22%3A%22{user_info["userUid"]}%22%2C%22count%22%3A20%2C%22includePromotedContent%22%3Atrue%2C%22withQuickPromoteEligibilityTweetFields%22%3Atrue%2C%22withVoice%22%3Atrue%2C%22withV2Timeline%22%3Atrue%7D&features=%7B%22rweb_lists_timeline_redesign_enabled%22%3Atrue%2C%22responsive_web_graphql_exclude_directive_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Atrue%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22tweetypie_unmention_optimization_enabled%22%3Atrue%2C%22responsive_web_edit_tweet_api_enabled%22%3Atrue%2C%22graphql_is_translatable_rweb_tweet_is_translatable_enabled%22%3Atrue%2C%22view_counts_everywhere_api_enabled%22%3Atrue%2C%22longform_notetweets_consumption_enabled%22%3Atrue%2C%22responsive_web_twitter_article_tweet_consumption_enabled%22%3Afalse%2C%22tweet_awards_web_tipping_enabled%22%3Afalse%2C%22freedom_of_speech_not_reach_fetch_enabled%22%3Atrue%2C%22standardized_nudges_misinfo%22%3Atrue%2C%22tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled%22%3Atrue%2C%22longform_notetweets_rich_text_read_enabled%22%3Atrue%2C%22longform_notetweets_inline_media_enabled%22%3Atrue%2C%22responsive_web_media_download_video_enabled%22%3Afalse%2C%22responsive_web_enhance_cards_enabled%22%3Afalse%7D&fieldToggles=%7B%22withArticleRichContentState%22%3Afalse%7D'
|
graphql_url = f'https://x.com/i/api/graphql/{url_key}/UserTweets?variables=%7B%22userId%22%3A%22{user_info["userUid"]}%22%2C%22count%22%3A20%2C%22includePromotedContent%22%3Atrue%2C%22withQuickPromoteEligibilityTweetFields%22%3Atrue%2C%22withVoice%22%3Atrue%7D&features=%7B%22rweb_video_screen_enabled%22%3Afalse%2C%22profile_label_improvements_pcf_label_in_post_enabled%22%3Atrue%2C%22responsive_web_profile_redirect_enabled%22%3Afalse%2C%22rweb_tipjar_consumption_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Atrue%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22premium_content_api_read_enabled%22%3Afalse%2C%22communities_web_enable_tweet_community_results_fetch%22%3Atrue%2C%22c9s_tweet_anatomy_moderator_badge_enabled%22%3Atrue%2C%22responsive_web_grok_analyze_button_fetch_trends_enabled%22%3Afalse%2C%22responsive_web_grok_analyze_post_followups_enabled%22%3Atrue%2C%22responsive_web_jetfuel_frame%22%3Atrue%2C%22responsive_web_grok_share_attachment_enabled%22%3Atrue%2C%22responsive_web_grok_annotations_enabled%22%3Afalse%2C%22articles_preview_enabled%22%3Atrue%2C%22responsive_web_edit_tweet_api_enabled%22%3Atrue%2C%22graphql_is_translatable_rweb_tweet_is_translatable_enabled%22%3Atrue%2C%22view_counts_everywhere_api_enabled%22%3Atrue%2C%22longform_notetweets_consumption_enabled%22%3Atrue%2C%22responsive_web_twitter_article_tweet_consumption_enabled%22%3Atrue%2C%22tweet_awards_web_tipping_enabled%22%3Afalse%2C%22responsive_web_grok_show_grok_translated_post%22%3Afalse%2C%22responsive_web_grok_analysis_button_from_backend%22%3Atrue%2C%22post_ctas_fetch_enabled%22%3Afalse%2C%22creator_subscriptions_quote_tweet_preview_enabled%22%3Afalse%2C%22freedom_of_speech_not_reach_fetch_enabled%22%3Atrue%2C%22standardized_nudges_misinfo%22%3Atrue%2C%22tweet_with_visibility_results_prefer_gql_limited_actions_p
olicy_enabled%22%3Atrue%2C%22longform_notetweets_rich_text_read_enabled%22%3Atrue%2C%22longform_notetweets_inline_media_enabled%22%3Atrue%2C%22responsive_web_grok_image_annotation_enabled%22%3Atrue%2C%22responsive_web_grok_imagine_annotation_enabled%22%3Atrue%2C%22responsive_web_grok_community_note_auto_translation_is_enabled%22%3Afalse%2C%22responsive_web_enhance_cards_enabled%22%3Afalse%7D&fieldToggles=%7B%22withArticlePlainText%22%3Afalse%7D'
|
||||||
yield scrapy.Request(graphql_url, callback=self.parse,
|
|
||||||
meta={'uid': user_info['userUid'], 'currentCount': 0},
|
yield scrapy.Request(url=graphql_url, callback=self.parse,
|
||||||
|
meta={
|
||||||
|
'uid': user_info['userUid'],
|
||||||
|
'proxy': 'http://127.0.0.1:10809',
|
||||||
|
'currentCount': 0
|
||||||
|
},
|
||||||
cookies=self.cookie_dict, headers=self.header)
|
cookies=self.cookie_dict, headers=self.header)
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
@ -164,10 +250,10 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
result = entry['content']['itemContent']['tweet_results']['result']
|
result = entry['content']['itemContent']['tweet_results']['result']
|
||||||
item['es_userid'] = result['core']['user_results']['result']['rest_id']
|
item['es_userid'] = result['core']['user_results']['result']['rest_id']
|
||||||
item['es_hkey'] = result['rest_id']
|
item['es_hkey'] = result['rest_id']
|
||||||
item['es_district'] = result['core']['user_results']['result']['legacy']['location']
|
item['es_district'] = result['core']['user_results']['result']['location']
|
||||||
screen_name = result['core']['user_results']['result']['legacy']['screen_name']
|
screen_name = result['core']['user_results']['result']['core']['screen_name']
|
||||||
user_name = result['core']['user_results']['result']['legacy']['name']
|
user_name = result['core']['user_results']['result']['core']['name']
|
||||||
item['es_urlname'] = 'https://twitter.com/%s/status/%s' % (screen_name, result['rest_id'])
|
item['es_urlname'] = 'https://x.com/%s/status/%s' % (screen_name, result['rest_id'])
|
||||||
item['es_authors'] = screen_name
|
item['es_authors'] = screen_name
|
||||||
item['es_extname'] = user_name
|
item['es_extname'] = user_name
|
||||||
legacy = result['legacy']
|
legacy = result['legacy']
|
||||||
|
|||||||
@ -7,4 +7,4 @@ from scrapy.cmdline import execute
|
|||||||
dirpath = os.path.dirname(os.path.abspath(__file__))
|
dirpath = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
sys.path.append(dirpath)
|
sys.path.append(dirpath)
|
||||||
execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}'])
|
execute(['scrapy', 'crawl', 'TaobaoUserSpider', '-a', 'params={}'])
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user