140 lines
5.7 KiB
Python
140 lines
5.7 KiB
Python
# -*- coding: utf-8 -*-
|
||
import re
|
||
import time
|
||
import logging as logger
|
||
from CustomWebsite.utils.string_utils import transform_full_width_number, trim_space
|
||
|
||
|
||
def get_year(current_year, date_str):
    """Look for a four-digit year (19xx or 20xx) inside ``date_str``.

    Args:
        current_year: Fallback value returned unchanged when no year is found.
        date_str: Text to scan.

    Returns:
        A ``(has_year, year)`` tuple.  When a year is found ``has_year`` is
        ``True`` and ``year`` is the matched text (a ``str``); otherwise
        ``has_year`` is ``False`` and ``year`` is ``current_year`` as passed in.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    match = re.search(r"(19|20)\d{2}", date_str)
    if match is None:
        return False, current_year
    return True, match.group()
|
||
|
||
|
||
def format_day(date_str):
    """Insert a day of "1" into a two-token date string.

    Commas are removed and the result is passed through ``trim_space``.  If
    the cleaned text splits into exactly two whitespace-separated tokens
    (treated as month and year), ``"<month> 1 <year>"`` is returned; any
    other token count returns the cleaned text unchanged.
    """
    cleaned = trim_space(date_str.replace(",", ""))
    tokens = cleaned.split()
    if len(tokens) != 2:
        return cleaned
    month, year = tokens
    return " ".join((month, "1", year))
|
||
|
||
|
||
def get_time_stamp(time_str, parse_rule=None):
    """Parse a free-form date/time string into a millisecond timestamp.

    Each rule maps a regular expression to a list of ``time.strptime``
    formats.  For every rule, ``get_format_time`` extracts and normalises the
    matched date; the formats are then tried in order until one parses.

    Args:
        time_str: Text containing a date (and optionally a time of day).
        parse_rule: Optional ``{regex: [strptime_format, ...]}`` mapping that
            replaces the built-in rules entirely when truthy.

    Returns:
        ``int(time.mktime(...) * 1000)`` for the first rule/format pair that
        parses, or ``0`` when nothing parses.
    """
    reg_pattern_dict = {
        r"\D*(\d+[\/\-年\.]\d+[\/\-月\.]\d+)\D*(\d*[:時]?\d*[:分]?\d*)?\D*": ["%Y-%m-%d %H:%M:%S"],
        r"\D*(\d+年 \d+月 \d+)\D*(\d*[:時]?\d*[:分]?\d*)?\D*": ["%Y-%m-%d %H:%M:%S"],
        r"\D*(\d+[/\-\.月]\d+[/\-\.日]\d{4})\D*(\d*:?\d*:?\d*)?\D*": ["%m-%d-%Y %H:%M:%S"],
        r"([a-zA-Z]+[\.]? \d+[,\.]? \d{4})\D*(\d*:?\d*:?\d*)?\D*": ["%B %d- %Y %H:%M:%S", "%b %d- %Y %H:%M:%S",
                                                                    "%b- %d- %Y %H:%M:%S", "%B %d %Y %H:%M:%S",
                                                                    "%b %d %Y %H:%M:%S", "%b-%d-%Y %H:%M:%S",
                                                                    "%B-%d-%Y %H:%M:%S"],
        r"(\d+ [a-zA-Z]+[,\.]? \d{4})\D*(\d*:?\d*:?\d*)?\D*": ["%d %B- %Y %H:%M:%S", "%d %b- %Y %H:%M:%S",
                                                               "%d %B %Y %H:%M:%S", "%d %b %Y %H:%M:%S",
                                                               "%d-%B-%Y %H:%M:%S", "%d-%b-%Y %H:%M:%S"],
        r"\d{10}": ["%Y-%m-%d %H:%M:%S"],
        r"\d{13}": ["%Y-%m-%d %H:%M:%S"],
        # Ordinal dates such as "June 9th, 2022 ".  get_format_time strips the
        # ordinal suffix and joins the date components with "-", so the
        # dash-separated formats at the end are the ones that can match; the
        # space-separated ones are kept only for backward compatibility.
        r"\D*([A-Z][a-z]+\s\d+\w+,\s\d{4})\s()\D*": ["%b %drd- %Y %H:%M:%S", "%b %dth- %Y %H:%M:%S",
                                                     "%b %dnd- %Y %H:%M:%S", "%b %dst- %Y %H:%M:%S",
                                                     "%b-%d-%Y %H:%M:%S", "%B-%d-%Y %H:%M:%S"],
    }
    if parse_rule:
        reg_pattern_dict = parse_rule

    time_stamp = 0
    time_str = transform_full_width_number(time_str)
    # "%b" expects "Sep"; rewrite the "Sept" abbreviation unless the full
    # month name is present (which would otherwise be mangled).
    if "September" not in time_str:
        time_str = time_str.replace("Sept", "Sep")

    for pattern, date_formats in reg_pattern_dict.items():
        format_date = get_format_time(pattern, time_str)
        if format_date is None:
            continue
        for date_format_str in date_formats:
            try:
                time_stamp = time.mktime(time.strptime(format_date, date_format_str))
                break
            except Exception as e:
                # Best effort: a format that does not fit is expected here,
                # so log it and fall through to the next candidate.
                logger.debug(e)
        # 0 doubles as the "nothing parsed yet" marker.
        if time_stamp != 0:
            break
    return int(time_stamp * 1000)
|
||
|
||
|
||
def transfer_time_zone(timestamp, time_zone):
    """Shift a millisecond timestamp by ``8 - time_zone`` hours.

    Args:
        timestamp: Timestamp in milliseconds; any falsy value yields ``0``.
        time_zone: Hour offset subtracted from 8 to obtain the shift
            (presumably the source zone's UTC offset, with UTC+8 as the
            reference -- confirm with callers).

    Returns:
        The shifted timestamp truncated to ``int``, or ``0`` for falsy input.
    """
    if not timestamp:
        return 0
    hour_shift = 8 - time_zone
    shifted = timestamp + hour_shift * 60 * 60 * 1000
    return int(shifted)
|
||
|
||
|
||
def get_format_time(pattern, time_str):
    """Extract a date/time from ``time_str`` and normalise it for ``strptime``.

    ``pattern`` is searched in ``time_str`` (after "元年" has been rewritten to
    "1年").  Group 1 of the match is used as the date and group 2 as the
    optional time of day.

    Args:
        pattern: Regular expression with a date group and a time group.
        time_str: Text to search.

    Returns:
        * ``None`` when ``pattern`` does not match.
        * A ``"%Y-%m-%d %H:%M:%S"`` local-time string when the whole match is
          an integer of at least 10 digits (a 13-digit value is cut to its
          first 10 digits before being interpreted as epoch seconds).
        * Otherwise ``"<date> <HH:MM:SS>"`` where the date components are
          joined with ``-`` in their original order and the time defaults to
          ``00:00:00``.
    """
    time_str = time_str.replace("元年", "1年")
    result = re.search(pattern, time_str)
    if result is None:
        return None

    ts = result.group(0).strip()
    try:
        # Recognise inputs that are given directly as a numeric timestamp.
        tn = int(ts)
        logger.debug(tn)
        if len(ts) >= 10:
            if len(ts) == 13:
                ts = ts[:10]
            return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(ts)))
    except (ValueError, OverflowError, OSError) as e:
        # Not a plain integer (or out of range): treat it as a textual date.
        logger.debug(e)

    date = result.group(1)
    time_t = result.group(2)

    # Turn every separator into "-" so the caller can use dash-joined formats.
    date = date.replace('/', '-').replace(".", "-").replace(
        ",", "-").replace("年", "-").replace("月", "-").replace("日", "").replace(' ', '-').replace('--', '-')
    date_array = date.split('-')

    # Drop ordinal suffixes ("1st", "2nd", "3rd", "4th"); "August" also ends
    # in "st" and must be left intact.
    for i, part in enumerate(date_array):
        if part.endswith(('st', 'nd', 'rd', 'th')) and part != 'August':
            date_array[i] = part[:-2]

    _apply_year_heuristics(date_array, time_str)
    date = '-'.join(date_array)

    return date + ' ' + _normalise_clock(time_t, time_str)


def _apply_year_heuristics(date_array, time_str):
    """Convert an era-based or two-digit leading year to a Gregorian year in place."""
    try:
        year = int(date_array[0])
    except ValueError:
        # The first component is not numeric (e.g. a month name).
        return

    # Era years marked 民國 / 民国 / 主体, or bare years in 101..119, are offset by 1911.
    if '民國' in time_str or '主体' in time_str or '民国' in time_str or (100 < year < 120):
        year += 1911
    # 令和 (Reiwa) years.
    if '令和' in time_str:
        year += 2018
    # 平成 (Heisei) years.
    if '平成' in time_str:
        year += 1988
    # Some Korean sites use two-digit years, e.g. 23 for 2023.  Skip this when
    # another component already is a four-digit year: the leading number is
    # then a day (as in "25 Jun 2022") and must not be rewritten.
    has_explicit_year = any(len(part) == 4 and part.isdigit() for part in date_array[1:])
    if 20 < year < 30 and not has_explicit_year:
        year += 2000
    date_array[0] = str(year)


def _normalise_clock(time_t, time_str):
    """Return the captured time of day as ``HH:MM:SS`` (``00:00:00`` if absent)."""
    if time_t is None or time_t.strip() == '':
        return '00:00:00'

    time_list = time_t.replace("時", ":").replace("分", ":").split(':')

    # Convert a 12-hour clock to 24-hour.  The PM marker is searched in the
    # whole input, so long texts are skipped to limit false positives.
    upper_str = time_str.upper()
    if len(time_str) < 100 and (upper_str.find('PM') > 0 or upper_str.find('P.M.') > 0):
        try:
            hour = int(time_list[0])
        except ValueError:
            hour = None
        # 12 PM is already noon, and hours above 12 are already 24-hour;
        # adding 12 to either would produce an invalid hour.
        if hour is not None and hour < 12:
            time_list[0] = str(hour + 12)

    # A trailing 時/分/":" leaves empty components ("10:30:"); treat them as zero.
    time_list = [part if part else '00' for part in time_list]
    while len(time_list) < 3:
        time_list.append('00')
    return ':'.join(time_list)
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: print the parsed timestamp for a few sample inputs.
    # Other inputs previously tried here:
    # [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
    samples = ['(平成31年3月27日掲載)', '(令和元年3月27日掲載)']
    for sample in samples:
        print(get_time_stamp(sample))
        # A custom rule can be supplied as the second argument, e.g.:
        # print(get_time_stamp(sample, {r"(\d{4}年\d{1,2}月\d{2}日)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']}))
|