2026-01-19 09:17:26 +08:00

213 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
# Core Scrapy project identity and module discovery settings.
BOT_NAME = 'MediaSpiders'
LOG_LEVEL = 'INFO'
SPIDER_MODULES = ['MediaSpiders.spiders']
NEWSPIDER_MODULE = 'MediaSpiders.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'MediaSpiders (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Local filesystem paths for saving/transferring serialized (protobuf) crawl data.
PROTO_SAVE_FILE_PATH = '/usr/local/spider_data'
FILE_TRANS_PATH = '/usr/local/spider_data'
# Auth-token endpoint and proxy-pool service used by the downloader middlewares.
TOKEN_API = 'http://127.0.0.1:8088/api/token/'
PROXY_SERVICE = 'http://39.101.194.63:6800/'
PER_BATCH_IP_USE_TIMES = 50 # number of uses allowed for each batch of proxy IPs; once reached, the proxy middleware fetches a fresh batch from the pool
# Redis connection settings (used for URL / SimHash de-duplication filters below).
REDIS_HOST = '107.182.191.3'
REDIS_PORT = 7379
REDIS_PWD = 'jlkj-841-2-redis'
# Reference REDIS_PWD instead of repeating the literal, so the credential
# is defined in exactly one place and cannot drift out of sync.
REDIS_PARAMS = {
    'password': REDIS_PWD,
}
# MySQL database used for persisted crawl data (schema 'oscm').
MYSQL_DB_HOST = '39.101.194.63'
MYSQL_DB_PORT = 23306
MYSQL_DB_USER = 'root'
MYSQL_DB_PASSWD = 'passok123A'
MYSQL_DB_SCHEMA = 'oscm'
# Backend HTTP APIs: crawl-job status updates and configuration lookups.
CRAWL_JOB_UPDATE_API = 'http://47.115.228.133:28081/api/open/crawljob'
WORD_BANK_QUERY_API = 'http://47.115.228.133:28081/api/open/wordBank/queryAll'
RULES_PARSER_QUERY_API = 'http://47.115.228.133:28081/api/rules/parser/queryPageable/0/1'
# Kafka sink for scraped items.
KAFKA_SERVER = '47.113.231.200:9092'
KAFKA_TOPIC = 'stream-protobuf'
BATCH_SAVE_SIZE = 5 # presumably items buffered per batch save — confirm in the pipeline code
# Redis keys for per-platform URL de-duplication filters.
TWITTER_FILTER_KEY = 'URL_Filter:MediaSpiders:Twitter_Filter'
FACEBOOK_FILTER_KEY = 'URL_Filter:MediaSpiders:Facebook_Filter'
YOUTUBE_FILTER_KEY = 'URL_Filter:MediaSpiders:Youtube_Filter'
WEIBO_FILTER_KEY = 'URL_Filter:MediaSpiders:Weibo_Filter'
WECHAT_FILTER_KEY = 'URL_Filter:MediaSpiders:Wechat_Filter'
FLICKR_FILTER_KEY = 'URL_Filter:MediaSpiders:Flickr_Filter'
# Redis keys for per-platform SimHash (near-duplicate content) filters.
TWITTER_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Twitter_Filter'
FACEBOOK_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Facebook_Filter'
YOUTUBE_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Youtube_Filter'
WEIBO_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Weibo_Filter'
WECHAT_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Wechat_Filter'
FLICKR_SIMHASH_FILTER_KEY = 'SimHash_Filter:MediaSpiders:Flickr_Filter'
WECHAT_LINKS_KEY = "MediaSpiders:Wechat_links"
# APIs for fetching / updating the list of target social-media accounts.
SOCIAL_USER_QUERY_ALL_API = "http://47.115.228.133:28081/api/open/target/social/queryAll?sortBy={sortBy}&shuffleResult={shuffleResult}"
SOCIAL_USER_UPDATE_API = "http://47.115.228.133:28081/api/open/target/social/update"
# Numeric platform codes for target social users (must match the backend's codes).
WEIBO_USER_TYPE = 0
TWITTER_USER_TYPE = 1
FACEBOOK_USER_TYPE = 2
YOUTUBE_USER_TYPE = 3
FLICKR_USER_TYPE = 4
WECHAT_USER_TYPE = 5
# Redis keys for Twitter comment / URL work queues.
TWITTER_COMMENT_FILTER_KEY = 'MediaSpiders:Twitter_Comment_Filter'
TWITTER_COMMENT_URL_KEY = 'MediaSpiders:Twitter_Comment_URL_Key'
TWITTER_URL_KEY = 'MediaSpiders:Twitter_URL_Key'
TWITTER_PID_KEY = ''  # NOTE(review): intentionally empty? confirm whether this key is still used
KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
# User-Agent strings rotated by UserAgentDownloaderMiddleware.
CUSTOM_USER_AGENT = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0',
]
# Use selenium_chrome when deployed on an external network to crawl Facebook.
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
SELENIUM_DRIVER_ARGUMENTS = [
'--headless',
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--window-size=1920,1080',
'--disable-blink-features=AutomationControlled'
]
# For local debugging only:
# SELENIUM_DRIVER_NAME = 'edge'
# SELENIUM_DRIVER_EXECUTABLE_PATH = "MicrosoftWebDriver.exe"
# # '--headless' if using chrome instead of firefox
# SELENIUM_DRIVER_ARGUMENTS = [
# # '--headless',
# '--start-maximized',
# '--no-sandbox',
# '--disable-dev-shm-usage',
# '--disable-gpu'
# ]
# Redis key under which Facebook login accounts are stored.
FACEBOOK_LOGIN_USER_KEYNAME = 'FACEBOOK_LOGIN_USERS'
# SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
# SELENIUM_DRIVER_NAME = 'firefox'
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://selenium_firefox/wd/hub'
# SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver')
# SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox ,'--proxy-server=http://192.168.199.243:10809'
SPLASH_URL = 'http://107.182.191.3:28050/'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
RANDOMIZE_DOWNLOAD_DELAY = True
DOWNLOAD_DELAY = 5
DOWNLOAD_TIMEOUT = 60
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
# NOTE(review): presumably a data-retention window in hours — confirm where it is consumed.
RETENTION_HOURS = 120
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'application/json, text/plain, */*',
# 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
# 'Connection': 'keep-alive',
# 'Host': 'm.weibo.cn',
# # 'Referer':'https://m.weibo.cn/u/3893259857?uid=3893259857&luicode=10000011&lfid=1076033893259857',
# 'TE':'Trailers',
# 'X-Requested-With':'XMLHttpRequest'
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
'MediaSpiders.middlewares.DumpFilterSpiderMiddleware': 543,
'MediaSpiders.middlewares.KeywordFilterSpiderMiddleware': 544,
'MediaSpiders.middlewares.SimhashFilterSpiderMiddleware': 545,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 700,
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# For local testing, enable MyProxyDownloaderMiddleware and also adjust the
# corresponding code in the middlewares module for local use.
# When crawling Weibo, MediaSpiders.scrapy_selenium.SeleniumMiddleware may be left disabled.
DOWNLOADER_MIDDLEWARES = {
'MediaSpiders.middlewares.DumpFilterDownloaderMiddleware': 543,
# 'MediaSpiders.middlewares.MyProxyDownloaderMiddleware': 544,
'MediaSpiders.middlewares.UserAgentDownloaderMiddleware': 545,
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
'MediaSpiders.scrapy_selenium.SeleniumMiddleware': 820
}
EXTENSIONS = {
'MediaSpiders.extensions.SetCrawlerStatusExtensions': 501
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
# 'MediaSpiders.pipelines.MediaspidersPipeline': 300,
# }
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'