From 8f1999376f968a2f174b5363a70507281ea8e976 Mon Sep 17 00:00:00 2001
From: yuxin-pc
Date: Fri, 13 Jun 2025 09:40:52 +0800
Subject: [PATCH] Update settings.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch the config entries back to ZQ
---
 .../WebsiteSpider/WebsiteSpider/settings.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/spiders/WebsiteSpider/WebsiteSpider/settings.py b/spiders/WebsiteSpider/WebsiteSpider/settings.py
index 5d513fb..0f0b33e 100644
--- a/spiders/WebsiteSpider/WebsiteSpider/settings.py
+++ b/spiders/WebsiteSpider/WebsiteSpider/settings.py
@@ -13,10 +13,10 @@ SELENIUM_DRIVER_NAME = 'firefox'
 SELENIUM_DRIVER_EXECUTABLE_PATH = [
     'http://154.90.40.71:28095',
     'http://154.90.63.14:28095',
-    'http://156.244.20.57:28095',
-    # 'http://10.55.13.121:28095',
-    # 'http://10.55.13.108:28095',
-    # 'http://10.55.13.3:28095',
+    # 'http://156.244.20.57:28095',
+    'http://10.55.13.121:28095',
+    'http://10.55.13.108:28095',
+    'http://10.55.13.3:28095',
 ]
 SELENIUM_DRIVER_ARGUMENTS = ['-headless']  # '--headless' if using chrome instead of firefox
 SELENIUM_DRIVER_PREFERENCES = {
@@ -27,10 +27,10 @@ SELENIUM_DRIVER_PREFERENCES = {
 PROXY_SERVICE = 'http://107.182.191.3:6800'
 PER_BATCH_IP_USE_TIMES = 5  # The proxy middleware fetches a batch of ips from the ip pool at a time; this sets how many times that batch is used before a new batch is fetched from the pool
 
-REDIS_HOST = '38.54.94.107'
-REDIS_PORT = '28097'
-# REDIS_HOST = '10.55.13.3'
-# REDIS_PORT = '7379'
+# REDIS_HOST = '38.54.94.107'
+# REDIS_PORT = '28097'
+REDIS_HOST = '10.55.13.3'
+REDIS_PORT = '7379'
 REDIS_PWD = 'jlkj-841-2-redis'
 REDIS_PARAMS = {
     'password': 'jlkj-841-2-redis',
@@ -39,9 +39,9 @@ REDIS_PARAMS = {
 USCARRIERS_KEY = 'USCARRIERS_ID'
 ZIP_FILE_PATH = ''
 
-KAFKA_PROCESS_QUEUE = ['stream-protobuf']
-# KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
-KAFKA_SERVER = '38.54.125.182:9092'
+# KAFKA_PROCESS_QUEUE = ['stream-protobuf']
+KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
+KAFKA_SERVER = '47.113.231.200:9092'
 
 RANDOMIZE_DOWNLOAD_DELAY = True
 
@@ -61,7 +61,7 @@ CRAWL_DEEPTH = 3  # If the start url is a portal home page or a board home page, the depth should be at least
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 # Controls how many urls are crawled concurrently, i.e. the number of simultaneous requests; also caps how many urls scrapy-redis reads from the queue
-CONCURRENT_REQUESTS = 16
+CONCURRENT_REQUESTS = 12
 
 PROTO_MODULE_PATH = 'WebsiteSpider.proto.Es_pb2'
 PROTO_CLASS_NAME = 'EsSets'
@@ -81,9 +81,9 @@ IMG_ZIP_FILE_NAME = 'image_data_publicinfo_'
 MYEXT_ENABLED = True  # Enable the scrapy extension that closes the spider after an idle timeout
 IDLE_NUMBER = 36  # Allowed idle time: an idle counter ticks up once every 5 seconds, and the program closes when it reaches IDLE_NUMBER, so idle time = 5 * IDLE_NUMBER seconds
 
-CRAWL_JOB_UPDATE_API = 'http://38.54.94.107:28081/api/open/crawljob'
-WORD_BANK_QUERY_API = 'http://38.54.94.107:28081/api/open/wordBank/queryAll'
-CRAWL_RULE_QUERY_API = 'http://38.54.94.107:28081/api/open/target/website/queryAll'
+CRAWL_JOB_UPDATE_API = 'http://47.115.228.133:28081/api/open/crawljob'
+WORD_BANK_QUERY_API = 'http://47.115.228.133:28081/api/open/wordBank/queryAll'
+CRAWL_RULE_QUERY_API = 'http://47.115.228.133:28081/api/open/target/website/queryAll'
 
 BATCH_SAVE_SIZE = 5
 PROTO_SAVE_FILE_PATH = '/usr/local/spider_data'
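
A few hedged notes on the settings this patch touches.

The comment on PER_BATCH_IP_USE_TIMES describes a batching proxy rotation: the middleware pulls a batch of proxies from PROXY_SERVICE, hands them out for a fixed number of requests, then pulls a fresh batch. A sketch of that pattern; the class name, and in particular the '/get_ips' endpoint, are assumptions for illustration, since the real API of the ip pool at http://107.182.191.3:6800 is not shown in this patch:

    import requests

    class BatchProxyMiddleware:
        """Sketch of the rotation PER_BATCH_IP_USE_TIMES describes (assumed names)."""

        def __init__(self, proxy_service, use_times):
            self.proxy_service = proxy_service
            self.use_times = use_times  # PER_BATCH_IP_USE_TIMES
            self.batch = []
            self.used = 0

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler.settings.get('PROXY_SERVICE'),
                       crawler.settings.getint('PER_BATCH_IP_USE_TIMES', 5))

        def _refresh(self):
            # '/get_ips' is a hypothetical endpoint returning a JSON list of
            # proxy urls; the actual ip-pool API is not part of this diff
            resp = requests.get(self.proxy_service + '/get_ips', timeout=10)
            self.batch = resp.json()
            self.used = 0

        def process_request(self, request, spider):
            # refetch a batch once it has been used PER_BATCH_IP_USE_TIMES times
            if not self.batch or self.used >= self.use_times:
                self._refresh()
            self.used += 1
            request.meta['proxy'] = self.batch[self.used % len(self.batch)]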
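The REDIS_HOST / REDIS_PORT / REDIS_PARAMS entries follow the usual scrapy-redis conventions, where the scheduler and dupefilter build their connection from these settings. A minimal sketch for sanity-checking the internal ZQ instance the patch switches to, using redis-py directly; host, port, and password are the values from the patch, everything else is an assumption:

    import redis

    # scrapy-redis would build its own connection from the same settings
    r = redis.Redis(host='10.55.13.3', port=7379,
                    password='jlkj-841-2-redis', socket_timeout=5)
    print(r.ping())  # True if the ZQ redis instance is reachable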
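With KAFKA_PROCESS_QUEUE switched back to both topics, crawled items presumably get produced to 'stream-protobuf' and 'stream-db' on the ZQ broker. A hedged sketch with kafka-python; only the broker address and topic names come from the patch, the fan-out and payload are assumptions:

    from kafka import KafkaProducer

    producer = KafkaProducer(bootstrap_servers='47.113.231.200:9092')
    for topic in ['stream-protobuf', 'stream-db']:
        # placeholder payload; the pipeline serializes items with the
        # protobuf class named by PROTO_MODULE_PATH / PROTO_CLASS_NAME
        producer.send(topic, b'serialized-item-bytes')
    producer.flush()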
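MYEXT_ENABLED and IDLE_NUMBER point at the common scrapy-redis idle-close pattern: an extension counts spider_idle signals, which Scrapy fires roughly every 5 seconds while the redis queue is empty, and closes the spider once the counter reaches IDLE_NUMBER. A minimal sketch under that assumption; the class name is made up, not the repo's actual extension:

    from scrapy import signals
    from scrapy.exceptions import NotConfigured

    class IdleCloseExtension:
        """Closes the spider after IDLE_NUMBER consecutive idle ticks."""

        def __init__(self, crawler, idle_number):
            self.crawler = crawler
            self.idle_number = idle_number
            self.idle_count = 0

        @classmethod
        def from_crawler(cls, crawler):
            if not crawler.settings.getbool('MYEXT_ENABLED'):
                raise NotConfigured
            ext = cls(crawler, crawler.settings.getint('IDLE_NUMBER', 36))
            crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
            crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
            return ext

        def spider_idle(self, spider):
            # fired about every 5s with no work queued, so the spider closes
            # after roughly 5 * IDLE_NUMBER seconds of continuous idleness
            self.idle_count += 1
            if self.idle_count >= self.idle_number:
                self.crawler.engine.close_spider(spider, 'closespider_idle')

        def item_scraped(self, item, spider):
            # any scraped item means the spider is not idle; reset the counter
            self.idle_count = 0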