# -*- coding: utf-8 -*-
import json
import logging as logger
import random
import re
import time
from urllib import parse

import redis
import scrapy
from scrapy_selenium import SeleniumRequest

from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.http_utils import http_post
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp


class RedditUserSpider(scrapy.Spider):
    """Selenium-driven spider that opens reddit.com, waits until the browser
    has been navigated to a /user/ profile page, scrolls the feed a few times,
    and reads the text of the rendered posts."""

    name = 'RedditUserSpider'
    custom_settings = {
        # Protobuf output settings for the project's ProtobufSavePipeline (registered below).
        'PROTO_MODULE_PATH': 'MediaSpiders.proto.Es_pb2',
        'PROTO_CLASS_NAME': 'EsSets',
        'PROTO_FIELD_NAME': 'Es',
        'PROTO_SAVE_FILE_NAME': 'public_info_data_',
        # Local storage directories and the item fields the download pipelines
        # write their results into.
        'IMAGES_STORE': r'/usr/local/temp_image/reddit',
        'IMAGES_RESULT_FIELD': 'es_urlimage',
        'FILES_STORE': r'/usr/local/videos',
        'FILES_RESULT_FIELD': 'es_video',
        # Project-specific archive name prefixes.
        'ZIP_FILE_NAME': 'image_data_publicinfo_',
        'FILE_ZIP_FILE_NAME': 'image_data_plane_',
        # Lower priority values run earlier: FilesPipeline (1), then
        # ImagesPipeline (2), then ProtobufSavePipeline (300).
        'ITEM_PIPELINES': {
            'scrapy.pipelines.images.ImagesPipeline': 2,
            'scrapy.pipelines.files.FilesPipeline': 1,
            'MediaSpiders.pipelines.ProtobufSavePipeline': 300,
        },
        'SPIDER_MIDDLEWARES': {
            'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
            # 'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
            # 'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
            # 'scrapy_splash.SplashDeduplicateArgsMiddleware': 700,
        },
    }

    def __init__(self, params=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.total_num = 100
        self.job_id = None
        self.authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
        # Optional overrides arrive as a JSON string in the `params` spider argument.
        if params:
            json_params = json.loads(params)
            if 'totalNum' in json_params:
                self.total_num = int(json_params['totalNum'])
            if 'authorization' in json_params:
                self.authorization = json_params['authorization']
            if 'job_id' in json_params:
                self.job_id = json_params['job_id']

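    # Example invocation (illustrative only; assumes the project is run with the
    # standard `scrapy crawl` CLI, and the sample values for the keys parsed in
    # __init__ above are made up):
    #
    #   scrapy crawl RedditUserSpider -a params='{"totalNum": 50, "job_id": "job-001"}'
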
    def start_requests(self):
        # A single SeleniumRequest just to obtain a live webdriver session;
        # the actual Reddit navigation happens in parse().
        yield SeleniumRequest(url='https://www.google.com/')

    def parse(self, response):
        # scrapy_selenium exposes the webdriver instance on the request meta.
        driver = response.request.meta['driver']
        driver.maximize_window()
        driver.get('https://www.reddit.com/')
        # Poll until the browser has been navigated (e.g. manually) to a
        # /user/ profile page, pausing a random interval between checks.
        while True:
            delay = random.randint(5, 11)
            time.sleep(delay)
            if '/user/' in driver.current_url:
                print("Navigated to the user page")
                break
        # Scroll to the bottom a few times so more posts lazy-load.
        for _ in range(3):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(5)
        # Note: find_elements_by_xpath is the Selenium 3 locator API (removed in Selenium 4).
        item_lines = driver.find_elements_by_xpath("//main[@id='main-content']//article")
        for line in item_lines:
            content = line.find_element_by_xpath(".//div[@id='-post-rtjson-content']").text
            # The extracted post text is not yet turned into an item here.
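
    # --- Sketch: yielding the scraped text as an item ---------------------------
    # The loop above extracts each post's text but does not yield anything, so the
    # pipelines configured in custom_settings never receive data from this spider.
    # A minimal continuation might look like the following. This is only a sketch:
    # MediaspidersItem is defined in MediaSpiders.items and its real field names are
    # not visible here, so `es_content` is a hypothetical field; `es_urlimage` and
    # `es_video` (from custom_settings above) are the fields the image/file
    # pipelines write their download results into, not fields to fill manually.
    #
    #     for line in item_lines:
    #         content = line.find_element_by_xpath(".//div[@id='-post-rtjson-content']").text
    #         item = MediaspidersItem()
    #         item['es_content'] = content   # hypothetical field name
    #         yield item                     # handed to the pipelines in ITEM_PIPELINES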