yuxin-pc 63b0d7090a Update settings.py
翻译字数设置
2026-02-12 09:36:06 +08:00

177 lines
7.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""Scrapy settings for the WebsiteSpider project.

The crawler runs distributed via scrapy-redis (shared scheduler/dupefilter
backed by Redis), renders pages through a pool of remote Selenium (Firefox)
endpoints, rotates proxies from an external proxy service, and ships results
to Kafka / protobuf files via custom pipelines in ``WebsiteSpider.*``.
"""

BOT_NAME = 'WebsiteSpider'
SPIDER_MODULES = ['WebsiteSpider.spiders']
NEWSPIDER_MODULE = 'WebsiteSpider.spiders'

# Distributed scheduling via scrapy-redis; SCHEDULER_PERSIST keeps the
# request queue and dupefilter state in Redis across restarts.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True

# Remote Selenium endpoints used by the custom SeleniumMiddleware.
# NOTE: despite the setting name, these are HTTP service URLs (a pool of
# remote drivers), not local executable paths.
SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = [
    'http://10.55.13.121:28095',
    # 'http://10.55.13.108:28095',
    'http://10.55.13.3:28095',
    'http://74.121.148.204:28095'
]
SELENIUM_DRIVER_ARGUMENTS = ['-headless']  # '--headless' if using chrome instead of firefox
SELENIUM_DRIVER_PREFERENCES = {
    # 2 = block; skip stylesheets to speed up rendering.
    "permissions.default.stylesheet": 2
    # "permissions.default.image": 2
}

# External proxy-pool service the proxy middleware pulls IP batches from.
PROXY_SERVICE = 'http://107.182.191.3:6800'
# How many times each fetched batch of proxy IPs is reused before the
# middleware requests a fresh batch from the pool.
PER_BATCH_IP_USE_TIMES = 5

# Redis connection (earlier deployments kept below for reference).
# REDIS_HOST = '38.54.94.107'
# REDIS_PORT = '28097'
# REDIS_HOST = '10.55.13.3'
# REDIS_PORT = '7379'
REDIS_HOST = '107.182.191.3'
REDIS_PORT = 7379
REDIS_PWD = 'jlkj-841-2-redis'
REDIS_PARAMS = {
    # Reference REDIS_PWD so the credential has a single source of truth.
    'password': REDIS_PWD,
}

USCARRIERS_KEY = 'USCARRIERS_ID'
ZIP_FILE_PATH = ''

# Kafka topics the downstream processor consumes.
# KAFKA_PROCESS_QUEUE = ['stream-protobuf']
KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
KAFKA_SERVER = '47.113.231.200:9092'

RANDOMIZE_DOWNLOAD_DELAY = True

# If True, it uses redis' ``SPOP`` operation. You have to use the ``SADD``
# command to add URLs to the redis queue. This could be useful if you
# want to avoid duplicates in your start urls list and the order of
# processing does not matter.
# REDIS_START_URLS_AS_SET = True
# If True, it uses redis ``zrevrange`` and ``zremrangebyrank`` operation. You have to use the ``zadd``
# command to add URLS and Scores to redis queue. This could be useful if you
# want to use priority and avoid duplicates in your start urls list.
REDIS_START_URLS_AS_ZSET = True

DOWNLOAD_TIMEOUT = 90

# Crawl depth (name keeps the historical "DEEPTH" spelling for
# compatibility with existing readers of this setting). If the start URL is
# a portal/section home page the depth must be at least 2; if it is a
# content page, at least 1.
CRAWL_DEEPTH = 3

# Configure maximum concurrent requests performed by Scrapy (default: 16).
# Controls how many URLs are requested simultaneously, and thereby also how
# many URLs scrapy-redis reads from the queue at a time.
CONCURRENT_REQUESTS = 12

# Protobuf serialization of scraped items.
PROTO_MODULE_PATH = 'WebsiteSpider.proto.Es_pb2'
PROTO_CLASS_NAME = 'EsSets'
PROTO_FIELD_NAME = 'Es'
PROTO_SAVE_FILE_NAME = r'public_info_data_'

# Attachment (FilesPipeline) and image (ImagesPipeline) storage.
FILES_STORE = r'/usr/local/temp_file'
FILES_URLS_FIELD = 'es_attachment'
FILES_RESULT_FIELD = 'es_attachment'
FILE_ZIP_FILE_NAME = 'attach_data_publicinfo_'
IMAGES_STORE = r'/usr/local/temp_image'
IMAGES_URLS_FIELD = 'es_urlimage'
IMAGES_RESULT_FIELD = 'es_urlimage'
IMG_ZIP_FILE_NAME = 'image_data_publicinfo_'

# Enable the idle-timeout close extension: the idle counter increments once
# every 5 seconds; the spider closes after IDLE_NUMBER ticks, i.e.
# idle time = 5 * IDLE_NUMBER seconds.
MYEXT_ENABLED = True
IDLE_NUMBER = 36

# Management-platform APIs: job status updates, keyword bank, crawl rules.
CRAWL_JOB_UPDATE_API = 'http://47.115.228.133:28081/api/open/crawljob'
WORD_BANK_QUERY_API = 'http://47.115.228.133:28081/api/open/wordBank/queryAll'
CRAWL_RULE_QUERY_API = 'http://47.115.228.133:28081/api/open/target/website/queryAll'

BATCH_SAVE_SIZE = 5
PROTO_SAVE_FILE_PATH = '/usr/local/spider_data'
FILE_TRANS_PATH = '/usr/local/spider_data'
RETENTION_HOURS = 120  # how long generated data files are kept

LOG_LEVEL = 'INFO'
# DEPTH_LIMIT=0

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'WebsiteSpider (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.1
# The download delay setting will honor only one of:
# Per-domain concurrency. If equal to CONCURRENT_REQUESTS the crawler may
# spend long stretches on a single site; at 1 it could crawl up to 16 sites
# in parallel, at 2 up to 8, and so on.
CONCURRENT_REQUESTS_PER_DOMAIN = 4
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# Pool the UserAgentDownloaderMiddleware rotates through.
CUSTOM_USER_AGENT = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
]

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'WebsiteSpider.middlewares.SimhashFilterSpiderMiddleware': 543,
    'WebsiteSpider.middlewares.DumpFilterSpiderMiddleware': 544,
    # 'WebsiteSpider.middlewares.KeywordFilterSpiderMiddleware': 545
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'WebsiteSpider.middlewares.InitDownloaderMiddleware': 299,
    'WebsiteSpider.middlewares.DeepLimitDownloaderMiddleware': 300,
    'WebsiteSpider.middlewares.KeywordFilterDownloaderMiddleware': 301,
    'WebsiteSpider.middlewares.DumpFilterDownloaderMiddleware': 302,
    'WebsiteSpider.middlewares.AutoProxyDownloaderMiddleware': 305,
    'WebsiteSpider.middlewares.UserAgentDownloaderMiddleware': 799,
    'WebsiteSpider.scrapy_selenium.SeleniumMiddleware': 800
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
    'WebsiteSpider.extensions.RedisSpiderSmartIdleClosedExtensions': 500,
    'WebsiteSpider.extensions.SetCrawlerStatusExtensions': 501
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'WebsiteSpider.pipelines.CustomAttachmentDownload': 1,
    'scrapy.pipelines.images.ImagesPipeline': 2,
    'WebsiteSpider.pipelines.ProtobufSavePipeline': 300,
}

############################## Translation ##############################
# Maximum number of characters submitted per translation request.
MAX_TEXT_LENGTH = 5999
# Translation API endpoint (replace with your server IP or domain).
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
# Delay between requests (seconds) to avoid API rate limiting.
REQUEST_DELAY = 1