jsc-dsp #1
@ -59,7 +59,14 @@ class SeleniumMiddleware:
|
|||||||
# Edge in headless mode
|
# Edge in headless mode
|
||||||
edge_options = EdgeOptions()
|
edge_options = EdgeOptions()
|
||||||
edge_options.use_chromium = True
|
edge_options.use_chromium = True
|
||||||
self.driver = Edge(executable_path='MicrosoftWebDriver.exe', options=edge_options)
|
self.driver = Edge(executable_path='msedgedriver.exe', options=edge_options)
|
||||||
|
self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
||||||
|
"source": """
|
||||||
|
Object.defineProperty(navigator, 'webdriver', {
|
||||||
|
get: () => undefined
|
||||||
|
})
|
||||||
|
"""
|
||||||
|
})
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_crawler(cls, crawler):
|
def from_crawler(cls, crawler):
|
||||||
|
|||||||
@ -74,26 +74,20 @@ TWITTER_PID_KEY = ''
|
|||||||
KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
|
KAFKA_PROCESS_QUEUE = ['stream-protobuf', 'stream-db']
|
||||||
|
|
||||||
CUSTOM_USER_AGENT = [
|
CUSTOM_USER_AGENT = [
|
||||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0',
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
|
|
||||||
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
|
|
||||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
|
|
||||||
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
|
|
||||||
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0',
|
|
||||||
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E',
|
|
||||||
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# 部署在外网采集fb时使用selenium_chrome
|
# 部署在外网采集fb时使用selenium_chrome
|
||||||
SELENIUM_DRIVER_NAME = 'chrome'
|
SELENIUM_DRIVER_NAME = 'chrome'
|
||||||
SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
|
SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
|
||||||
|
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
|
||||||
SELENIUM_DRIVER_ARGUMENTS = [
|
SELENIUM_DRIVER_ARGUMENTS = [
|
||||||
'--headless',
|
'--headless',
|
||||||
'--no-sandbox',
|
'--no-sandbox',
|
||||||
'--disable-dev-shm-usage',
|
'--disable-dev-shm-usage',
|
||||||
'--disable-gpu',
|
'--disable-gpu',
|
||||||
'--window-size=1920,1080'
|
'--window-size=1920,1080',
|
||||||
|
'--disable-blink-features=AutomationControlled'
|
||||||
]
|
]
|
||||||
|
|
||||||
# 本地调试用
|
# 本地调试用
|
||||||
|
|||||||
@ -65,7 +65,7 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
logger.info("login twitter")
|
logger.info("login twitter")
|
||||||
driver = response.request.meta['driver']
|
driver = response.request.meta['driver']
|
||||||
driver.maximize_window()
|
driver.maximize_window()
|
||||||
driver.get('https://twitter.com/i/flow/login')
|
driver.get('https://x.com/i/flow/login')
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
# 获取采集登录账号并登录
|
# 获取采集登录账号并登录
|
||||||
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
|
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
|
||||||
|
|||||||
@ -87,7 +87,9 @@ def get_format_time(pattern, time_str):
|
|||||||
date = result.group(1)
|
date = result.group(1)
|
||||||
time_t = result.group(2)
|
time_t = result.group(2)
|
||||||
date = date.replace('/', '-').replace(".", "-").replace(
|
date = date.replace('/', '-').replace(".", "-").replace(
|
||||||
",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(' ', '-').replace('--', '-')
|
",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(
|
||||||
|
"년", "-").replace("월", "-").replace("일", "").replace(
|
||||||
|
' ', '-').replace('--', '-')
|
||||||
date_array = date.split('-')
|
date_array = date.split('-')
|
||||||
for i in range(len(date_array)):
|
for i in range(len(date_array)):
|
||||||
if (date_array[i].endswith('st') or
|
if (date_array[i].endswith('st') or
|
||||||
@ -128,7 +130,7 @@ def get_format_time(pattern, time_str):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
|
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
|
||||||
a = ['06.10.2023 03:24']
|
a = ['2026년 1월 6일 화요일 1면 [사진있음]']
|
||||||
for _ in a:
|
for _ in a:
|
||||||
print(get_time_stamp(_))
|
# print(get_time_stamp(_))
|
||||||
# print(get_time_stamp(_, {r"(\d{4}年\d{1,2}月\d{2}日)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']}))
|
print(get_time_stamp(_, {r"(\d{4}년 \d{1,2}월 \d{1,2}일)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']}))
|
||||||
|
|||||||
@ -7,4 +7,4 @@ from scrapy.cmdline import execute
|
|||||||
dirpath = os.path.dirname(os.path.abspath(__file__))
|
dirpath = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
sys.path.append(dirpath)
|
sys.path.append(dirpath)
|
||||||
execute(['scrapy', 'crawl', 'FacebookUserSpider', '-a', 'params={}'])
|
execute(['scrapy', 'crawl', 'TwitterUserSpider', '-a', 'params={}'])
|
||||||
|
|||||||
@ -89,7 +89,9 @@ def get_format_time(pattern, time_str):
|
|||||||
date = result.group(1)
|
date = result.group(1)
|
||||||
time_t = result.group(2)
|
time_t = result.group(2)
|
||||||
date = date.replace('/', '-').replace(".", "-").replace(
|
date = date.replace('/', '-').replace(".", "-").replace(
|
||||||
",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(' ', '-').replace('--', '-')
|
",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(
|
||||||
|
"년", "-").replace("월", "-").replace("일", "").replace(
|
||||||
|
' ', '-').replace('--', '-')
|
||||||
date_array = date.split('-')
|
date_array = date.split('-')
|
||||||
for i in range(len(date_array)):
|
for i in range(len(date_array)):
|
||||||
if (date_array[i].endswith('st') or
|
if (date_array[i].endswith('st') or
|
||||||
@ -135,7 +137,7 @@ def get_format_time(pattern, time_str):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
|
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
|
||||||
a = ['July 26, 2024 12:53 PM']
|
a = ['2026년 1월 6일 화요일 1면 [사진있음]']
|
||||||
for _ in a:
|
for _ in a:
|
||||||
print(get_time_stamp(_))
|
# print(get_time_stamp(_))
|
||||||
# print(get_time_stamp(_, {r"(\w+ \d+, \d{4})\D*(\d+:\d+)\D*": ['%B-%d-%Y %H:%M:%S']}))
|
print(get_time_stamp(_, {r"(\d{4}년 \d{1,2}월 \d{1,2}일)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']}))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user