# osc/research/nuofang-db/nfm/mperson/webpage_parser.py
# Last modified: 2025-05-28 19:16:17 +08:00
import os
from lxml import etree
import json
import re
# Directory holding the saved person pages (*.html) that this script parses.
target_file_path = r"E:\yuxin\nuofang-data\person\webpage"
def trim_space(input_str):
    """Normalize whitespace in *input_str*.

    Converts ideographic spaces (U+3000) to ASCII spaces, then collapses
    every run of two or more spaces into a single space.
    """
    normalized = input_str.replace("\u3000", " ")
    return re.sub(" {2,}", " ", normalized)
def parse_base_info(current_selector):
    """Extract label/value pairs from the page's 'good-table table1' table.

    Each row contributes label cells (``td.b``) paired positionally with
    value cells (``td`` without class ``b``); the value cell's text nodes
    are concatenated and whitespace-normalized.  Returns a dict mapping
    label text to the cleaned value text.
    """
    base_info = {}
    rows = current_selector.xpath("//table[@class='good-table table1']/tbody/tr")
    for row in rows:
        labels = row.xpath("./td[@class='b']/text()")
        value_cells = row.xpath("./td[not(@class='b')]")
        for idx, label in enumerate(labels):
            # Positional pairing: the i-th label belongs to the i-th value cell.
            raw_text = "".join(value_cells[idx].xpath(".//text()"))
            base_info[label] = trim_space(raw_text).strip()
    return base_info
def parse_website_info(current_selector):
    """Parse the active tab (``div.tabInfo > div.cItem.active``) into a dict.

    The result contains:
      * one label/value entry taken from the tab's ``<dt>`` line (typically
        the update-time row), and
      * one entry per sub-section, keyed by its ``<h2>`` title, whose value
        is either the section's joined paragraph text or its ``<dd>``
        source list joined with ';'.

    Returns an empty dict when the tab area is absent.
    """
    info_area = current_selector.xpath(
        "//div[@class='tabInfo']/div[@class='cItem active']")
    if not info_area:
        # BUG FIX: the original returned [] here but a dict on the success
        # path; return {} so callers always get one type (the caller's
        # len(...) > 0 check works unchanged).
        return {}
    area = info_area[0]

    website_info = {}
    update_time_info = area.xpath(".//dt//text()")
    if update_time_info:
        update_time = trim_space("".join(update_time_info))
        # BUG FIX: the original called update_time.split("") — an empty
        # separator always raises ValueError.  The separator character was
        # evidently lost in an encoding round-trip; a full-width colon
        # (U+FF1A) is assumed here.  TODO(review): confirm against the
        # source pages.
        label, sep, value = update_time.partition("\uff1a")
        if sep:
            website_info[label.strip()] = value.strip()

    subtitles = area.xpath("./div/h2/text()")
    subcontents = []
    for section in area.xpath("./div/dl"):
        paragraph_texts = section.xpath("./div//text()")
        if paragraph_texts:
            # Join paragraphs with a literal two-character " \n" marker
            # (backslash + n), reproducing the original output format.
            paragraphs = [trim_space(t).strip() for t in paragraph_texts]
            subcontents.append(" " + " \\n".join(paragraphs))
        else:
            source_texts = section.xpath("./dd//text()")
            if source_texts:
                subcontents.append(";".join(source_texts))

    # zip() tolerates a title/content count mismatch; the original indexed
    # subcontents[i] and raised IndexError when contents were missing.
    # NOTE(review): the original also applied subtitle.replace("", "") — a
    # no-op whose target character was lost in the same bad encoding.
    for subtitle, subcontent in zip(subtitles, subcontents):
        website_info[subtitle] = subcontent
    return website_info
if __name__ == '__main__':
    # Walk the saved .html pages, parse each one, and write the extracted
    # fields to a per-page JSON file named after the page title.
    count = 0
    for _path in os.listdir(target_file_path):
        if not _path.endswith('.html'):
            continue
        count += 1
        webpage_info = {}

        # BUG FIX: use context managers so file handles are closed even if
        # parsing raises (the original left the result file open on error).
        # Flatten the page to one line before feeding it to lxml.
        with open(os.path.join(target_file_path, _path),
                  mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)

        title_info = selector.xpath(
            "//div[@class='good-info-box']/div[@class='top cls']/span/text()")
        title = ''.join(title_info).strip()
        # The last dropdown entry links to the original source URL; [0]
        # intentionally raises IndexError on a malformed page so bad input
        # is noticed rather than silently skipped.
        url_info = selector.xpath(
            '//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
        webpage_info["url_info"] = url_info.replace("#", "")

        base_info = parse_base_info(selector)
        if base_info:
            webpage_info["base_info"] = base_info
        website_info = parse_website_info(selector)
        if len(website_info) > 0:
            webpage_info["website_info"] = website_info

        # Strip characters that are illegal or awkward in file names.
        result_file_name = (title.replace("/", "-")
                            .replace('"', '').replace("'", '') + '.json')
        result_file_path = ('E:/yuxin/nuofang-data/person/result0515/'
                            + result_file_name)
        with open(result_file_path, 'w', encoding='utf-8') as result_file:
            result_file.write(json.dumps(webpage_info, ensure_ascii=False))
        print("[No. {}] {} 写入完成原始url是 {}".format(
            count, title, webpage_info["url_info"]))