# -*- coding: utf-8 -*-
import json
import logging as logger
import random
import re
import time
from urllib import parse

import redis
import scrapy
from scrapy_selenium import SeleniumRequest

from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.http_utils import http_post
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp


class RedditUserSpider(scrapy.Spider):
    name = 'RedditUserSpider'
    custom_settings = {
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        'IMAGES_STORE': r'/usr/local/temp_image/reddit',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        'ZIP_FILE_NAME': 'image_data_publicinfo_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
        },
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            # 'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
            # 'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
            # 'scrapy_splash.SplashDeduplicateArgsMiddleware': 700,
        }
    }

    def __init__(self, params=None, *args, **kwargs):
        super(RedditUserSpider, self).__init__(*args, **kwargs)
        self.total_num = 100
        self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
        if params:
            json_params = json.loads(params)
            if 'totalNum' in json_params:
                self.total_num = int(json_params['totalNum'])
            if 'authorization' in json_params:
                self.authorization = json_params['authorization']
            if 'job_id' in json_params:
                self.job_id = json_params['job_id']

    def start_requests(self):
        # Open a throwaway page first so scrapy-selenium attaches a live driver to the response.
        yield SeleniumRequest(url='https://www.google.com/')

    def parse(self, response):
        driver = response.request.meta['driver']
        driver.maximize_window()
        driver.get('https://www.reddit.com/')
        # Poll at a randomized interval until the browser lands on a user profile page.
        while True:
            delay = random.randint(5, 11)
            time.sleep(delay)
            if '/user/' in driver.current_url:
                self.logger.info('Navigated to the user page')
                break
        # Scroll to the bottom a few times to trigger lazy loading of additional posts.
        for i in range(3):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(5)
        item_lines = driver.find_elements_by_xpath("//main[@id='main-content']//article")
        for line in item_lines:
            # Extract the post body text; it is not yet packed into a MediaspidersItem here.
            content = line.find_element_by_xpath(".//div[@id='-post-rtjson-content']").text
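

# Usage sketch (assumptions: the project is launched with the standard Scrapy CLI and the
# scrapy-selenium middleware is enabled in the project's settings.py; the JSON values below
# are illustrative, they simply mirror the keys handled in __init__ above):
#
#   scrapy crawl RedditUserSpider -a params='{"totalNum": 50, "job_id": "demo-job"}'
#
# Scrapy's -a flag forwards each key=value pair to the spider constructor, so the string
# above arrives as the `params` argument and is parsed with json.loads(); omitting it
# leaves the defaults (total_num=100 and the built-in authorization header) in place.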