diff --git a/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py b/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py index 2df273f..5788a53 100644 --- a/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py +++ b/spiders/MediaSpiders/MediaSpiders/scrapy_selenium/middlewares.py @@ -59,7 +59,14 @@ class SeleniumMiddleware: # Edge in headless mode edge_options = EdgeOptions() edge_options.use_chromium = True - self.driver = Edge(executable_path='MicrosoftWebDriver.exe', options=edge_options) + self.driver = Edge(executable_path='msedgedriver.exe', options=edge_options) + self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { + "source": """ + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }) + """ + }) @classmethod def from_crawler(cls, crawler): diff --git a/spiders/MediaSpiders/MediaSpiders/settings.py b/spiders/MediaSpiders/MediaSpiders/settings.py index bf0820a..d248fb3 100644 --- a/spiders/MediaSpiders/MediaSpiders/settings.py +++ b/spiders/MediaSpiders/MediaSpiders/settings.py @@ -74,26 +74,20 @@ TWITTER_PID_KEY = '' KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db'] CUSTOM_USER_AGENT = [ - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', - 'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50', - 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0', - 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E', - 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0', ] # 部署在外网采集fb时使用selenium_chrome SELENIUM_DRIVER_NAME = 'chrome' -SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098' +SELENIUM_DRIVER_EXECUTABLE_PATH = 'local' +# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098' SELENIUM_DRIVER_ARGUMENTS = [ '--headless', '--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu', - '--window-size=1920,1080' + '--window-size=1920,1080', + '--disable-blink-features=AutomationControlled' ] # 本地调试用 diff --git a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py index 4884797..d83cba8 100644 --- a/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py +++ b/spiders/MediaSpiders/MediaSpiders/spiders/TwitterUserSpider.py @@ -65,7 +65,7 @@ class TwitterSpider(scrapy.Spider): logger.info("login twitter") driver = response.request.meta['driver'] driver.maximize_window() - driver.get('https://twitter.com/i/flow/login') + driver.get('https://x.com/i/flow/login') time.sleep(5) # 获取采集登录账号并登录 login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts') diff --git a/spiders/MediaSpiders/MediaSpiders/utils/date_utils.py b/spiders/MediaSpiders/MediaSpiders/utils/date_utils.py index 5ecb4be..e641be1 100644 --- a/spiders/MediaSpiders/MediaSpiders/utils/date_utils.py +++ b/spiders/MediaSpiders/MediaSpiders/utils/date_utils.py @@ -87,7 +87,9 @@ def get_format_time(pattern, time_str): date = result.group(1) time_t = result.group(2) date = date.replace('/', '-').replace(".", "-").replace( - ",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(' ', '-').replace('--', '-') + ",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace( + "년", "-").replace("월", "-").replace("일", "").replace( + ' ', '-').replace('--', '-') date_array = date.split('-') for i in range(len(date_array)): if (date_array[i].endswith('st') or @@ -128,7 +130,7 @@ def get_format_time(pattern, time_str): if __name__ == '__main__': # a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日'] - a = ['06.10.2023 03:24'] + a = ['2026년 1월 6일 화요일 1면 [사진있음]'] for _ in a: - print(get_time_stamp(_)) - # print(get_time_stamp(_, {r"(\d{4}年\d{1,2}月\d{2}日)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']})) + # print(get_time_stamp(_)) + print(get_time_stamp(_, {r"(\d{4}년 \d{1,2}월 \d{1,2}일)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']})) diff --git a/spiders/MediaSpiders/run.py b/spiders/MediaSpiders/run.py index 0d4e91c..34f1f5c 100644 --- a/spiders/MediaSpiders/run.py +++ b/spiders/MediaSpiders/run.py @@ -7,4 +7,4 @@ from scrapy.cmdline import execute dirpath = os.path.dirname(os.path.abspath(__file__)) sys.path.append(dirpath) -execute(['scrapy', 'crawl', 'FacebookUserSpider', '-a', 'params={}']) +execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}']) diff --git a/spiders/WebsiteSpider/WebsiteSpider/utils/date_utils.py b/spiders/WebsiteSpider/WebsiteSpider/utils/date_utils.py index 62e9bb1..c213da3 100644 --- a/spiders/WebsiteSpider/WebsiteSpider/utils/date_utils.py +++ b/spiders/WebsiteSpider/WebsiteSpider/utils/date_utils.py @@ -89,7 +89,9 @@ def get_format_time(pattern, time_str): date = result.group(1) time_t = result.group(2) date = date.replace('/', '-').replace(".", "-").replace( - ",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(' ', '-').replace('--', '-') + ",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace( + "년", "-").replace("월", "-").replace("일", "").replace( + ' ', '-').replace('--', '-') date_array = date.split('-') for i in range(len(date_array)): if (date_array[i].endswith('st') or @@ -135,7 +137,7 @@ def get_format_time(pattern, time_str): if __name__ == '__main__': # a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日'] - a = ['July 26, 2024 12:53 PM'] + a = ['2026년 1월 6일 화요일 1면 [사진있음]'] for _ in a: - print(get_time_stamp(_)) - # print(get_time_stamp(_, {r"(\w+ \d+, \d{4})\D*(\d+:\d+)\D*": ['%B-%d-%Y %H:%M:%S']})) + # print(get_time_stamp(_)) + print(get_time_stamp(_, {r"(\d{4}년 \d{1,2}월 \d{1,2}일)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']}))