osc/research/archive/RedditUserSpider.py

# -*- coding: utf-8 -*-
import json
import logging as logger
import random
import re
import time
from urllib import parse
import redis
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.http_utils import http_post
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp


class RedditUserSpider(scrapy.Spider):
    name = 'RedditUserSpider'
    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/reddit',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_publicinfo_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
        },
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            # 'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
            # 'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
            # 'scrapy_splash.SplashDeduplicateArgsMiddleware': 700,
        }
    }

    def __init__(self, params=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.total_num = 100
        self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
        if params:
            json_params = json.loads(params)
            if 'totalNum' in json_params:
                self.total_num = int(json_params['totalNum'])
            if 'authorization' in json_params:
                self.authorization = json_params['authorization']
            if 'job_id' in json_params:
                self.job_id = json_params['job_id']
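
    # Usage sketch (assumption, not from the original source): spider arguments are
    # typically passed as a JSON string on the command line and parsed in __init__
    # above, e.g.
    #   scrapy crawl RedditUserSpider -a params='{"totalNum": 50, "job_id": "demo-job"}'
    # The keys 'totalNum', 'authorization' and 'job_id' are the ones handled above;
    # the example values are illustrative only.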

    def start_requests(self):
        # Open a placeholder page first so scrapy-selenium attaches a live WebDriver
        # to the response; the actual Reddit navigation happens in parse().
        yield SeleniumRequest(url='https://www.google.com/')

    def parse(self, response):
        driver = response.request.meta['driver']
        driver.maximize_window()
        driver.get('https://www.reddit.com/')
        # Poll (with a random delay) until the browser window has been navigated to a
        # /user/ profile page before starting extraction.
        while True:
            delay = random.randint(5, 11)
            time.sleep(delay)
            if '/user/' in driver.current_url:
                print("Navigated to user page")
                break
        # Scroll down a few times so that more posts are lazy-loaded onto the page.
        for i in range(3):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(5)
        # Extract the text of each post article rendered on the profile page.
        item_lines = driver.find_elements(By.XPATH, "//main[@id='main-content']//article")
        for line in item_lines:
            content = line.find_element(By.XPATH, ".//div[@id='-post-rtjson-content']").text
            pass
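            # Illustrative sketch (assumption): the extracted text could be packed
            # into a MediaspidersItem and yielded to the configured pipelines; the
            # 'es_content' field name below is hypothetical.
            # item = MediaspidersItem()
            # item['es_content'] = content
            # yield item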