# Scrapy settings for the PhotoSpiders project.
# (Web-UI scrape residue removed: timestamp/size/"Raw Blame History"/Unicode-warning
# lines were not part of the Python source and would be syntax errors.)
# -*- coding: utf-8 -*-
# Core project identity and infrastructure endpoints.
BOT_NAME = 'PhotoSpiders'
SPIDER_MODULES = ['PhotoSpiders.spiders']
NEWSPIDER_MODULE = 'PhotoSpiders.spiders'
# Endpoint of the external proxy-IP pool service used by the proxy middleware.
PROXY_SERVICE = 'http://107.182.191.3:6800'
PER_BATCH_IP_USE_TIMES = 5 # the proxy middleware fetches a batch of IPs from the pool at a time; each batch is reused this many times before a fresh batch is fetched
# Redis connection for scrapy-redis (scheduler / dupefilter / start-URL queue).
# NOTE(review): credentials are hard-coded here and duplicated in REDIS_PARAMS
# below; consider moving them to environment variables or a secrets store.
REDIS_HOST = '107.182.191.3'
REDIS_PORT = '7379'
REDIS_PWD = 'jlkj-841-2-redis'
# Extra connection parameters passed through to the redis client.
REDIS_PARAMS = {
'password': 'jlkj-841-2-redis',
}
# Empty by default; presumably overridden at runtime — confirm against the pipelines.
ZIP_FILE_PATH = ''
# Kafka topics consumed downstream, and the broker address.
KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
KAFKA_SERVER = '47.113.231.200:9092'
# Randomize the per-request download delay (standard Scrapy setting).
RANDOMIZE_DOWNLOAD_DELAY = True
# If True, it uses redis' ``SPOP`` operation. You have to use the ``SADD``
# command to add URLs to the redis queue. This could be useful if you
# want to avoid duplicates in your start urls list and the order of
# processing does not matter.
# REDIS_START_URLS_AS_SET = True
# If True, it uses redis ``zrevrange`` and ``zremrangebyrank`` operation. You have to use the ``zadd``
# command to add URLS and Scores to redis queue. This could be useful if you
# want to use priority and avoid duplicates in your start urls list.
DOWNLOAD_TIMEOUT = 90 # seconds before a download is considered failed
CRAWL_DEEPTH = 3 # if the start URL is a portal or board homepage the depth must be at least 2; if it is a content page, at least 1. NOTE(review): "DEEPTH" is a typo for "DEPTH", but renaming would break whatever project code reads this key.
# Protobuf serialization: generated module path, message class, repeated field
# name, and the prefix of the files the serialized data is saved under.
PROTO_MODULE_PATH = 'PhotoSpiders.proto.Es_pb2'
PROTO_CLASS_NAME = 'EsSets'
PROTO_FIELD_NAME = 'Es'
PROTO_SAVE_FILE_NAME = r'public_info_data_'
# FilesPipeline configuration: storage dir, the item field holding attachment
# URLs (results are written back to the same field), and the zip-file prefix.
FILES_STORE = r'/usr/local/temp_file'
FILES_URLS_FIELD = 'es_attachment'
FILES_RESULT_FIELD = 'es_attachment'
FILE_ZIP_FILE_NAME = 'attach_data_publicinfo_'
# ImagesPipeline configuration: same layout as the files settings above.
IMAGES_STORE = r'/usr/local/temp_image'
IMAGES_URLS_FIELD = 'es_urlimage'
IMAGES_RESULT_FIELD = 'es_urlimage'
IMG_ZIP_FILE_NAME = 'image_data_publicinfo_'
MYEXT_ENABLED = True # enable the custom extension that shuts Scrapy down on timeout
# HTTP API used to report crawl-job status updates (see SetCrawlerStatusExtensions).
CRAWL_JOB_UPDATE_API = 'http://47.115.228.133:28081/api/open/crawljob'
BATCH_SAVE_SIZE = 5 # presumably items buffered per batch save — confirm in ProtobufSavePipeline
PROTO_SAVE_FILE_PATH = '/usr/local/spider_data'
FILE_TRANS_PATH = '/usr/local/spider_data'
RETENTION_HOURS = 120 # presumably how long generated data files are kept — confirm in cleanup code
LOG_LEVEL = 'INFO'
# DEPTH_LIMIT=0
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'PhotoSpiders (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Controls the number of concurrent requests (simultaneous URL fetches); this
# also bounds how many URLs scrapy-redis reads from the queue at a time.
CONCURRENT_REQUESTS = 16
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1.5
# The download delay setting will honor only one of:
# Per-domain concurrency: if equal to CONCURRENT_REQUESTS the crawler may spend
# long stretches on a single site; at 1 it could crawl up to 16 sites
# concurrently, at 2 up to 8.
CONCURRENT_REQUESTS_PER_DOMAIN = 4
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
# Pool of User-Agent strings rotated by UserAgentDownloaderMiddleware.
# Fix: the MSIE 7.0 and MSIE 9.0 entries were truncated — their platform
# sections were missing the closing ')', producing malformed UA strings.
CUSTOM_USER_AGENT = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
]
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# Project middleware; presumably filters duplicate/already-dumped requests — confirm in middlewares.py.
'PhotoSpiders.middlewares.DumpFilterSpiderMiddleware': 544
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'PhotoSpiders.middlewares.DumpFilterDownloaderMiddleware': 302,
# Rotates request User-Agent headers, presumably from CUSTOM_USER_AGENT above.
'PhotoSpiders.middlewares.UserAgentDownloaderMiddleware': 799,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
# Reports crawler status, presumably via CRAWL_JOB_UPDATE_API — confirm in extensions.py.
'PhotoSpiders.extensions.SetCrawlerStatusExtensions': 501
}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# Built-in media pipelines run first (lowest order) to download attachments/images,
# then the project pipeline serializes items to protobuf.
'scrapy.pipelines.files.FilesPipeline': 1,
'scrapy.pipelines.images.ImagesPipeline': 2,
'PhotoSpiders.pipelines.ProtobufSavePipeline': 300,
# 'PhotoSpiders.pipelines.CustomFilesPipeline': 1
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'