2025-05-28 19:16:17 +08:00

140 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import re
import time
import logging as logger
from CustomWebsite.utils.string_utils import transform_full_width_number, trim_space
def get_year(current_year, date_str):
    """Extract a four-digit year (19xx or 20xx) from ``date_str``.

    Args:
        current_year: Fallback value returned unchanged when no year is found.
        date_str: Text to scan for a year.

    Returns:
        A ``(has_year, year)`` tuple. When a year is found, ``has_year`` is
        ``True`` and ``year`` is the matched text (a ``str``); otherwise
        ``has_year`` is ``False`` and ``year`` is ``current_year`` as passed in.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    year_match = re.search(r"(19|20)\d{2}", date_str)
    if year_match is None:
        return False, current_year
    return True, year_match.group()
def format_day(date_str):
    """Insert a day of ``1`` into a two-token date such as ``"June 2022"``.

    Commas are removed and the result is passed through ``trim_space``
    (project helper; presumably normalizes whitespace -- verify against its
    definition). If the cleaned text splits into exactly two whitespace
    separated tokens they are treated as month and year and returned as
    ``"<month> 1 <year>"``; any other input is returned cleaned but otherwise
    unchanged.
    """
    cleaned = trim_space(date_str.replace(",", ""))
    tokens = cleaned.split()
    if len(tokens) != 2:
        return cleaned
    month, year = tokens
    return " ".join((month, "1", year))
def get_time_stamp(time_str, parse_rule=None):
    """Convert a free-form date/time string into epoch milliseconds.

    Each rule maps a regular expression to a list of ``time.strptime``
    formats. The rules are tried in order: ``get_format_time`` extracts and
    normalizes the text matched by the regex, and the first format that parses
    the normalized text wins.

    Args:
        time_str: Text containing a date (and optionally a time).
        parse_rule: Optional ``{regex: [strptime formats]}`` mapping that
            replaces the built-in rules when truthy.

    Returns:
        Milliseconds since the epoch as an ``int`` (the parsed time is
        interpreted as local time by ``time.mktime``), or ``0`` when no rule
        produced a parseable date.
    """
    default_rules = {
        r"\D*(\d+[\/\-年\.]\d+[\/\-月\.]\d+)\D*(\d*[:時]?\d*[:分]?\d*)?\D*": ["%Y-%m-%d %H:%M:%S"],
        r"\D*(\d+年 \d+月 \d+)\D*(\d*[:時]?\d*[:分]?\d*)?\D*": ["%Y-%m-%d %H:%M:%S"],
        r"\D*(\d+[/\-\.月]\d+[/\-\.日]\d{4})\D*(\d*:?\d*:?\d*)?\D*": ["%m-%d-%Y %H:%M:%S"],
        r"([a-zA-Z]+[\.]? \d+[,\.]? \d{4})\D*(\d*:?\d*:?\d*)?\D*": [
            "%B %d- %Y %H:%M:%S", "%b %d- %Y %H:%M:%S",
            "%b- %d- %Y %H:%M:%S", "%B %d %Y %H:%M:%S",
            "%b %d %Y %H:%M:%S", "%b-%d-%Y %H:%M:%S",
            "%B-%d-%Y %H:%M:%S",
        ],
        r"(\d+ [a-zA-Z]+[,\.]? \d{4})\D*(\d*:?\d*:?\d*)?\D*": [
            "%d %B- %Y %H:%M:%S", "%d %b- %Y %H:%M:%S",
            "%d %B %Y %H:%M:%S", "%d %b %Y %H:%M:%S",
            "%d-%B-%Y %H:%M:%S", "%d-%b-%Y %H:%M:%S",
        ],
        r"\d{10}": ["%Y-%m-%d %H:%M:%S"],
        r"\d{13}": ["%Y-%m-%d %H:%M:%S"],
        r"\D*([A-Z][a-z]+\s\d+\w+,\s\d{4})\s()\D*": [
            "%b %drd- %Y %H:%M:%S", "%b %dth- %Y %H:%M:%S",
            "%b %dnd- %Y %H:%M:%S", "%b %dst- %Y %H:%M:%S",
            # get_format_time strips the ordinal suffix and joins the date
            # parts with "-" (e.g. "Jun 3rd, 2022" -> "Jun-3-2022"), so the
            # hyphenated forms below are the ones that actually match.
            "%b-%d-%Y %H:%M:%S", "%B-%d-%Y %H:%M:%S",
        ],
    }
    rules = parse_rule if parse_rule else default_rules

    # Project helper; presumably converts full-width digits to ASCII -- verify.
    time_str = transform_full_width_number(time_str)
    # strptime's %b does not understand "Sept"; leave "September" intact.
    if "September" not in time_str:
        time_str = time_str.replace("Sept", "Sep")

    time_stamp = 0
    for pattern, candidate_formats in rules.items():
        format_date = get_format_time(pattern, time_str)
        if format_date is None:
            continue
        for date_format_str in candidate_formats:
            try:
                time_stamp = time.mktime(
                    time.strptime(format_date, date_format_str))
            except Exception as e:
                # Best effort: a format that does not fit is simply skipped.
                logger.debug(e)
            else:
                break
        if time_stamp != 0:
            break
    return int(time_stamp * 1000)
def transfer_time_zone(timestamp, time_zone):
    """Shift ``timestamp`` by ``(8 - time_zone)`` hours expressed in milliseconds.

    Presumably converts a millisecond timestamp between UTC+8 and the zone
    given by ``time_zone`` (an hour offset) -- confirm against callers.

    Returns:
        The shifted timestamp truncated to ``int``, or ``0`` when
        ``timestamp`` is falsy (``0``, ``None``, ...).
    """
    if not timestamp:
        return 0
    offset_ms = (8 - time_zone) * 60 * 60 * 1000
    return int(timestamp + offset_ms)
# Single-character date separators mapped onto the "-" expected by the
# strptime formats. The CJK characters are written as escapes so the exact
# code points are unambiguous.
_DATE_SEPARATOR_TABLE = str.maketrans({
    "/": "-",
    ".": "-",
    ",": "-",
    "\u5e74": "-",   # 年 (year marker)
    "\u6708": "-",   # 月 (month marker)
    "\u65e5": None,  # 日 (day marker) closes the date, so it is dropped
    " ": "-",
})
# CJK hour/minute markers mapped onto ":".
_TIME_SEPARATOR_TABLE = str.maketrans({
    "\u6642": ":",  # 時 (hour marker)
    "\u5206": ":",  # 分 (minute marker)
})
_ORDINAL_SUFFIXES = ("st", "nd", "rd", "th")


def get_format_time(pattern, time_str):
    """Extract the date/time matched by ``pattern`` and normalize it.

    Args:
        pattern: Regular expression. Group 1 is the date part and group 2
            (optional) the time part. A pattern whose whole match is a bare
            integer of 10 or more digits is treated as an epoch timestamp.
        time_str: Text to search.

    Returns:
        ``None`` when the pattern does not match or captures no date.
        For an epoch timestamp, the local time formatted as
        ``"%Y-%m-%d %H:%M:%S"`` (13-digit values are cut to their first 10
        digits, i.e. milliseconds to seconds).
        Otherwise ``"<date> <time>"`` where the date parts are joined with
        ``-`` and the time is ``H:M:S`` (``00:00:00`` when absent).
    """
    # Era "first year" is written with 元 instead of the digit 1.
    time_str = time_str.replace("元年", "1年")
    result = re.search(pattern, time_str)
    if result is None:
        return None

    ts = result.group(0).strip()
    try:
        # Recognize input that is directly a numeric timestamp.
        tn = int(ts)
        logger.debug(tn)
        if len(ts) >= 10:
            if len(ts) == 13:
                ts = ts[:10]
            return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(ts)))
    except (ValueError, OverflowError, OSError) as e:
        # Not a usable timestamp; fall through to the date/time groups.
        logger.debug(e)

    group_count = result.re.groups
    date = result.group(1) if group_count >= 1 else None
    if date is None:
        # Pattern without a (participating) date group: nothing to normalize.
        return None
    time_t = result.group(2) if group_count >= 2 else None

    # Unify every separator to "-"; a separator followed by a space yields
    # "--", which is collapsed once.
    date = date.translate(_DATE_SEPARATOR_TABLE).replace('--', '-')
    date_array = date.split('-')
    for i, part in enumerate(date_array):
        # Drop ordinal suffixes ("3rd" -> "3"); "August" also ends in "st".
        if part.endswith(_ORDINAL_SUFFIXES) and part != 'August':
            date_array[i] = part[:-2]

    try:
        year = int(date_array[0])
    except ValueError:
        # First part is not a year (e.g. a month name); leave it untouched.
        pass
    else:
        # Era-based calendars are offset to the Gregorian year.
        if '民國' in time_str or '主体' in time_str or '民国' in time_str or (100 < year < 120):
            year += 1911
        if '令和' in time_str:
            year += 2018
        if '平成' in time_str:
            year += 1988
        if 20 < year < 30:
            # Some Korean sites use two-digit years, e.g. 23 for 2023.
            year += 2000
        date_array[0] = str(year)
    date = '-'.join(date_array)

    if time_t is None or time_t.strip() == '':
        time_t = '00:00:00'
    else:
        time_list = time_t.translate(_TIME_SEPARATOR_TABLE).split(':')
        upper_text = time_str.upper()
        is_pm = len(time_str) < 100 and (
            upper_text.find('PM') > 0 or upper_text.find('P.M.') > 0)
        if is_pm:
            # Convert 12-hour to 24-hour; 12 PM is already 12 and hours that
            # are already >= 12 must not be pushed past 23.
            try:
                hour = int(time_list[0])
            except ValueError:
                hour = None
            if hour is not None and hour < 12:
                time_list[0] = str(hour + 12)
        # "10時30分" becomes "10:30:" -> fill the empty trailing field.
        time_list = [field if field else '00' for field in time_list]
        time_list.extend(['00'] * (3 - len(time_list)))
        time_t = ':'.join(time_list)
    return date + ' ' + time_t
if __name__ == '__main__':
    # Manual smoke test: print the parsed timestamp for a few sample strings.
    # Other sample inputs:
    # [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
    samples = ['平成31年3月27日掲載', '令和元年3月27日掲載']
    for sample in samples:
        print(get_time_stamp(sample))
        # Example with a custom rule:
        # print(get_time_stamp(sample, {r"(\d{4}年\d{1,2}月\d{2}日)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']}))