osc/research/nuofang-db/nfm/mact/page_parser.py
2025-05-28 19:16:17 +08:00

113 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import shutil
from lxml import etree
import json
import re
# Directory holding the downloaded activity HTML pages to be parsed.
target_file_path = r"E:/yuxin/nuofang-data/activity/webpage"
# Directory where one JSON result file per page is written.
result_folder_path = r"E:/yuxin/nuofang-data/activity/result"
# Directory where images referenced by the pages are copied.
image_result_path = r'E:/yuxin/nuofang-data/activity/images'
def trim_space(input_str):
    """Collapse every run of two or more spaces into a single space.

    Only ASCII space runs are touched; tabs and newlines pass through.
    """
    return re.sub(' {2,}', ' ', input_str)
def trim_n_space(input_str):
    """Remove literal backslash-n-space markers from *input_str*.

    NOTE: this operates on the two-character escape sequence ``\\n``
    followed by a space (as it appears in escaped text), not on real
    newline characters.
    """
    # A few passes merge adjacent marker pairs into one ...
    for _pass in range(5):
        input_str = input_str.replace("\\n \\n ", "\\n ")
    # ... then any marker that remains is dropped entirely.
    return input_str.replace("\\n ", "")
def get_table_lines(table_areas):
    """Collect the cell texts of every table row under *table_areas*.

    For each ``<tr>`` the ``<th>`` texts and the ``<td>`` texts are
    gathered into separate lists; each non-empty list is appended to
    the result in that order, so a row containing both header and data
    cells contributes two entries.
    """
    rows = []
    for area in table_areas:
        for tr in area.xpath(".//tr"):
            header_cells = [trim_space("".join(th.xpath(".//text()")))
                            for th in tr.xpath(".//th")]
            data_cells = [trim_space("".join(td.xpath(".//text()")))
                          for td in tr.xpath(".//td")]
            if header_cells:
                rows.append(header_cells)
            if data_cells:
                rows.append(data_cells)
    return rows
def parse_base_info(current_selector):
    """Parse the label/value lines of the page's basic-info tab.

    Each ``<p>`` under the first info tab is expected to read like
    "标签：值". The joined text is split on the full-width colon; the
    part before it becomes the key (NBSP and spaces removed) and the
    remainder (joined back without separators, as the original code
    did) becomes the whitespace-collapsed value.
    """
    base_info_line = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']/div[@class='layui-row txt']/div/p")
    base_info = {}
    for line in base_info_line:
        line_text = ''.join(line.xpath(".//text()"))
        # NOTE(review): the separator character was lost to an
        # "ambiguous Unicode" scrub in the source view; split('') would
        # raise ValueError. A full-width colon '：' is the conventional
        # label/value delimiter on these Chinese pages — confirm.
        parts = line_text.split('：')
        label = parts[0].replace("\xa0", "").replace(" ", "").strip()
        value = ''.join(parts[1:]).strip()
        base_info[label] = trim_space(value)
    return base_info
if __name__ == '__main__':
    # Walk every downloaded HTML page, extract its metadata, body text
    # and media references, copy the images aside, and write one JSON
    # result file per page.
    count = 0
    for _path in os.listdir(target_file_path):
        if not _path.endswith('.html'):
            continue
        count += 1
        webpage_info = {}
        # Flatten the document to one line so xpath text joins are not
        # polluted by raw newlines. `with` guarantees the handle is
        # closed even if parsing below raises.
        with open(target_file_path + '/' + _path, mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)
        # Page title from the info-box header.
        title_info = selector.xpath("//div[@class='good-info-box']/div[@class='top cls']/span/text()")
        title = ''.join(title_info).strip()
        # Original article URL: last entry of the breadcrumb drop menu.
        url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
        webpage_info["url_info"] = url_info.replace("#", "")
        base_info = parse_base_info(selector)
        if base_info:
            webpage_info["base_info"] = base_info
        # Main body paragraphs from the second tab, skipping the
        # boilerplate "(查看原文)" / "(主要内容)" markers.
        main_content_info = selector.xpath(
            "//div[@class='good-info-box']/div[@class='tab '][2]//div[@class='cItem active']//p//text()")
        main_content_list = []
        for mc in main_content_info:
            if '(查看原文)' in mc or '(主要内容)' in mc:
                continue
            main_content_list.append(" " + mc)
        main_content = "\n".join(main_content_list)
        # Copy referenced images next to the results; only bare file
        # names are kept in the JSON.
        image_info = selector.xpath(
            "//div[@class='good-info-box']/div[@class='tab '][2]/div[@class='tabInfo sourcetabInfo']/div[@class='cItem active']/div[@id='media_list']//img/@src")
        for image_uri in image_info:
            # image_uri starts with '/', so drop it before concatenating.
            image_path = target_file_path + image_uri[1:]
            shutil.copy(image_path, image_result_path)
        images = [img.split('/')[-1] for img in image_info]
        video_info = selector.xpath("//div[@class='cItem active']/div[@id='media_list']/div[2]/div/video/@src")
        videos = [v.split('/')[-1] for v in video_info]
        # Source link text via an absolute positional xpath — fragile,
        # but kept as-is to match the page layout this was written for.
        source_info = selector.xpath("/html/body/div[2]/div[1]/div[2]/div[3]/div/div/div[5]/a/text()")
        source = ''.join(source_info)
        webpage_info['main_content'] = main_content.replace("'", "")
        webpage_info['images'] = images
        webpage_info['videos'] = videos
        webpage_info['info_source_url'] = source
        # Sanitize characters that are illegal in Windows file names.
        result_file_name = title + '.json'
        for bad_char in ['"', '/', '\\', '|', ':', '?', '<', '>', '*']:
            result_file_name = result_file_name.replace(bad_char, "_")
        result_file_path = "{}/{}".format(result_folder_path, result_file_name)
        with open(result_file_path, 'w', encoding='utf-8') as result_file:
            result_file.write(json.dumps(webpage_info, ensure_ascii=False))
        print("[No. {}] {} 写入完成".format(count, title))