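"""Extract structured records from locally saved HTML detail pages.

For each .html file in target_file_path, pull the title, original-article
URL, basic-info fields, main body text, and image/video references out of
the page with lxml XPath queries, copy the referenced images into
image_result_path, and write the collected fields as one JSON file per
page into result_folder_path.
"""
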
import json
import os
import re
import shutil

from lxml import etree

target_file_path = r"E:/yuxin/nuofang-data/activity/webpage"  # scraped HTML pages
result_folder_path = r"E:/yuxin/nuofang-data/activity/result"  # JSON output
image_result_path = r"E:/yuxin/nuofang-data/activity/images"  # copied images

def trim_space(input_str):
    """Collapse runs of two or more spaces into a single space."""
    return re.sub(' {2,}', ' ', input_str)

def trim_n_space(input_str):
    """Collapse runs of literal backslash-n markers and drop one before a full-width colon.

    Operates on the two-character sequence "\\n", not on real newlines.
    Currently unused by the main script.
    """
    for _ in range(5):
        input_str = input_str.replace("\\n \\n ", "\\n ")
    input_str = input_str.replace("\\n :", ":")
    return input_str

def get_table_lines(table_areas):
    """Flatten the <tr> rows of each table into lists of cell texts.

    Header cells (<th>) and data cells (<td>) of a row are emitted as
    separate lists; rows without cells of a kind contribute nothing.
    """
    _contents = []
    for table_area in table_areas:
        for line in table_area.xpath(".//tr"):
            line_th = [trim_space("".join(th.xpath(".//text()"))) for th in line.xpath(".//th")]
            line_td = [trim_space("".join(td.xpath(".//text()"))) for td in line.xpath(".//td")]
            if line_th:
                _contents.append(line_th)
            if line_td:
                _contents.append(line_td)
    return _contents

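# Note: get_table_lines is not called below. A hypothetical use against a
# parsed page (names assumed, not taken from this script) might be:
#     rows = get_table_lines(selector.xpath(".//table"))
#     # -> [['header1', 'header2'], ['cell1', 'cell2'], ...]
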
def parse_base_info(current_selector):
    """Parse the "label:value" lines of the basic-info tab into a dict."""
    base_info_line = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']"
        "/div[@class='cItem active']/div[@class='layui-row txt']/div/p")
    base_info = {}
    for line in base_info_line:
        line_text = ''.join(line.xpath(".//text()"))
        # Split on the first full-width colon only, so colons inside the
        # value survive.
        label, _, value = line_text.partition(':')
        label = label.replace("\xa0", "").replace(" ", "").strip()
        base_info[label] = trim_space(value.strip())
    return base_info

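# Per-page pipeline: read the HTML, XPath out title / URL / basic info /
# body text / media references, copy images aside, then dump everything as
# one JSON file named after the title.
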
if __name__ == '__main__':
    count = 0
    for _path in os.listdir(target_file_path):
        if _path.endswith('.html'):
            # Debug hooks for re-running a single page:
            # if _path == '15th SPSS Activation Ceremony_军情动态_全球军事态势情报数据库.html':
            # if _path == 'AEDC stands up 804th Test Group with ceremony_军情动态_全球军事态势情报数据库.html':
            count += 1
            webpage_info = {}
            # Strip real newlines so extracted text nodes come back as
            # single-line strings.
            with open(target_file_path + '/' + _path, mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')
            selector = etree.HTML(html_content)

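            # Page title, and the original-article link taken from the last
            # entry of the drop-down menu ("#" characters are stripped).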
            title_info = selector.xpath("//div[@class='good-info-box']/div[@class='top cls']/span/text()")
            title = ''.join(title_info).strip()

            url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
            webpage_info["url_info"] = url_info.replace("#", "")

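            # Label/value pairs from the basic-info tab.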
            base_info = parse_base_info(selector)
            if len(base_info) > 0:
                webpage_info["base_info"] = base_info

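            # Body paragraphs; "(查看原文)" ("view original") and "(主要内容)"
            # ("main content") marker lines are skipped.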
            main_content_info = selector.xpath(
                "//div[@class='good-info-box']/div[@class='tab '][2]//div[@class='cItem active']//p//text()")
            main_content_list = []
            for mc in main_content_info:
                if '(查看原文)' in mc or '(主要内容)' in mc:
                    continue
                main_content_list.append(" " + mc)
            main_content = "\n".join(main_content_list)

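            # Copy each referenced image into image_result_path; only the
            # bare file names go into the JSON.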
            image_info = selector.xpath(
                "//div[@class='good-info-box']/div[@class='tab '][2]/div[@class='tabInfo sourcetabInfo']"
                "/div[@class='cItem active']/div[@id='media_list']//img/@src")
            for image_uri in image_info:
                # src values appear to be page-relative ("./..."); dropping
                # the leading "." resolves them against the webpage folder.
                image_path = target_file_path + image_uri[1:]
                shutil.copy(image_path, image_result_path)
            images = [img.split('/')[-1] for img in image_info]

            video_info = selector.xpath("//div[@class='cItem active']/div[@id='media_list']/div[2]/div/video/@src")
            videos = [v.split('/')[-1] for v in video_info]

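            # Source attribution, located via an absolute XPath.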
            source_info = selector.xpath("/html/body/div[2]/div[1]/div[2]/div[3]/div/div/div[5]/a/text()")
            source = ''.join(source_info)

            # Straight apostrophes are swapped for typographic ones (’).
            webpage_info['main_content'] = main_content.replace("'", "’")
            webpage_info['images'] = images
            webpage_info['videos'] = videos
            webpage_info['info_source_url'] = source

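            # Name the JSON file after the title, replacing characters that
            # are illegal in Windows file names.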
            result_file_name = title + '.json'
            for ch in ['"', '/', '\\', '|', ':', '?', '<', '>', '*']:
                result_file_name = result_file_name.replace(ch, "_")
            result_file_path = "{}/{}".format(result_folder_path, result_file_name)
            with open(result_file_path, 'w', encoding='utf-8') as result_file:
                result_file.write(json.dumps(webpage_info, ensure_ascii=False))
            print("[No. {}] {} written".format(count, title))