osc/research/nuofang-db/nfm/mperson/webpage_parser.py
2025-05-28 19:16:17 +08:00

93 lines
3.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
from lxml import etree
import json
import re
# Input directory containing the saved person-page HTML files (Windows path).
target_file_path = r"E:\yuxin\nuofang-data\person\webpage"
def trim_space(input_str):
    """Normalize whitespace in *input_str*.

    Replaces every U+3000 (fullwidth/ideographic space, common in
    Chinese pages) with a plain ASCII space, then collapses any run of
    two or more spaces down to a single space.
    """
    normalized = input_str.replace("\u3000", " ")
    return re.sub(' {2,}', ' ', normalized)
def parse_base_info(current_selector):
    """Extract label/value pairs from the page's base-info table.

    Walks every row of the ``good-table table1`` table; in each row the
    label cells carry class ``b`` and the value cells do not.  Labels are
    paired with value cells positionally.

    Args:
        current_selector: parsed page root (lxml element with ``.xpath``).

    Returns:
        dict mapping label text to whitespace-normalized cell text;
        empty dict when the table is absent.
    """
    base_info = {}
    rows = current_selector.xpath("//table[@class='good-table table1']/tbody/tr")
    for row in rows:
        labels = row.xpath("./td[@class='b']/text()")
        value_cells = row.xpath("./td[not(@class='b')]")
        # zip pairs each label with its value cell and silently drops an
        # unmatched trailing label/cell; the original indexed value_cells
        # by label position and raised IndexError on a short row.
        for label, cell in zip(labels, value_cells):
            cell_text = trim_space("".join(cell.xpath(".//text()"))).strip()
            base_info[label] = cell_text
    return base_info
def parse_website_info(current_selector):
    """Parse the active tab (``div.cItem.active``) of the tabInfo section.

    Reads an optional update-time line (a ``<dt>`` whose text looks like
    "label：value"), then pairs each sub-section heading (``<h2>``) with
    its content: either the joined ``<div>`` paragraphs or, failing that,
    the ``<dd>`` source line.

    Args:
        current_selector: parsed page root (lxml element with ``.xpath``).

    Returns:
        dict of section-title -> content.  NOTE(review): the original
        returned ``[]`` on the missing-area branch; normalized to ``{}``
        so the return type is consistent (the caller only checks len()).
    """
    website_info = {}
    info_area = current_selector.xpath("//div[@class='tabInfo']/div[@class='cItem active']")
    if not info_area:
        return website_info
    area = info_area[0]
    update_time_info = area.xpath(".//dt//text()")
    if update_time_info:
        update_time = trim_space("".join(update_time_info))
        # The original source had split("") / replace("", "") -- the
        # separator character was dropped by the upstream scrape (the
        # file's "ambiguous Unicode" warning).  Reconstructed here as the
        # fullwidth colon "：" -- TODO confirm against a sample page.
        parts = update_time.split("：", 1)
        if len(parts) == 2:
            # guard: the original indexed [1] unconditionally and would
            # raise IndexError when no separator is present
            website_info[parts[0].strip()] = parts[1].strip()
    subtitles = area.xpath("./div/h2/text()")
    subcontents = []
    for section in area.xpath("./div/dl"):
        content_texts = section.xpath("./div//text()")
        if content_texts:
            paragraphs = [trim_space(text).strip() for text in content_texts]
            # literal " \n" (backslash-n) joiner preserved from the original
            subcontents.append(" " + " \\n".join(paragraphs))
        else:
            source_texts = section.xpath("./dd//text()")
            if source_texts:
                subcontents.append(";".join(source_texts))
    # zip tolerates a heading without a matching content entry; the
    # original indexed subcontents[i] and could raise IndexError.
    for subtitle, subcontent in zip(subtitles, subcontents):
        website_info[subtitle.replace("：", "")] = subcontent
    return website_info
if __name__ == '__main__':
    # Parse every saved .html page under target_file_path and write one
    # JSON result file per page.
    result_dir = 'E:/yuxin/nuofang-data/person/result0515/'
    count = 0
    for _path in os.listdir(target_file_path):
        if not _path.endswith('.html'):
            continue
        count += 1
        webpage_info = {}
        # `with` guarantees the handle is closed even if parsing raises;
        # newlines are stripped so joined xpath text stays on one line.
        with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)
        title_info = selector.xpath("//div[@class='good-info-box']/div[@class='top cls']/span/text()")
        title = ''.join(title_info).strip()
        # the last dropdown entry holds the original source url; its href
        # carries a trailing '#'
        url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
        webpage_info["url_info"] = url_info.replace("#", "")
        base_info = parse_base_info(selector)
        if len(base_info) > 0:
            webpage_info["base_info"] = base_info
        website_info = parse_website_info(selector)
        if len(website_info) > 0:
            webpage_info["website_info"] = website_info
        # strip characters that are unsafe or awkward in filenames
        result_file_name = title.replace("/", "-").replace('"', '').replace("'", '') + '.json'
        with open(result_dir + result_file_name, 'w', encoding='utf-8') as result_file:
            result_file.write(json.dumps(webpage_info, ensure_ascii=False))
        print("[No. {}] {} 写入完成原始url是 {}".format(count, title, webpage_info["url_info"]))