osc/research/nuofang-db/nfm/mstruct/webpage_parser.py


import os
from lxml import etree
import json
import re
# Directory of saved HTML pages to parse.
target_file_path = r"E:\yuxin\nuofang-data\structure\0420\webpage"

# XPath selectors for the main blocks of each page.
xpath_parse_dict = {
    'title': "//div[@class='good-info-box']/div[@class='top cls']/span/text()",
    'base_info': "//div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']/div[@class='layui-row txt']/div[1]/p",
    'leader_info': "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']/div[@id='leader_list']//tbody/tr",
    'website_info': "//div[@class='good-info-box']/div[@class='tab '][2]/div[@class='tabInfo sourcetabInfo']/div",
}
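
# For reference, the selectors above imply roughly this page skeleton.
# This is reconstructed from the XPaths and is illustrative only, not the
# actual markup of the source pages:
#
#   <div class="good-info-box">
#     <div class="top cls"><span>TITLE</span></div>
#     <div class="tab ">                      <!-- tab 1: basic info + leaders -->
#       <div id="tab1_info">
#         <div class="cItem active">
#           <div class="layui-row txt"><div><p>label：value</p>...</div></div>
#           <div id="leader_list"><table><tbody><tr>...</tr></tbody></table></div>
#         </div>
#       </div>
#     </div>
#     <div class="tab ">                      <!-- tab 2: per-site info -->
#       <div class="tabInfo sourcetabInfo"><div>...</div></div>
#     </div>
#   </div>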
def trim_space(input_str):
    """Collapse runs of two or more spaces into a single space."""
    return re.sub(' {2,}', ' ', input_str)
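
# Example (illustrative):
#   trim_space('Federal  Bureau   of Investigation') -> 'Federal Bureau of Investigation'
# Note that only ASCII spaces are collapsed; tabs, newlines and full-width
# spaces ('　') pass through unchanged.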
def parse_base_info(current_selector):
    """Parse the basic-information block into a {label: value} dict."""
    base_info_line = current_selector.xpath(xpath_parse_dict['base_info'])
    base_info = {}
    for line in base_info_line:
        line_text = ''.join(line.xpath(".//text()"))
        # Each line reads "label：value"; the delimiter, assumed here to be
        # the full-width colon '：', was dropped from the displayed source.
        label = line_text.split('：')[0].strip()
        value = ''.join(line_text.split('：')[1:]).strip()
        base_info[label] = trim_space(value)
    return base_info
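
# Illustrative return value of parse_base_info (placeholders, not actual data):
#   {'<label 1>': '<value 1>', '<label 2>': '<value 2>', ...}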
def parse_leader_info(current_selector):
    """Parse the leadership table into a list of {header: cell} dicts."""
    leader_info_line = current_selector.xpath(xpath_parse_dict['leader_info'])
    leader_info_list = []
    ths = []
    for i in range(len(leader_info_line)):
        leader_info = {}
        line = leader_info_line[i]
        if i == 0:
            # The first row carries the column headers (<th>).
            ths = line.xpath("./th/text()")
        else:
            for th in ths:
                leader_info[th] = ''
            tds = line.xpath("./td")
            for tdi in range(len(tds)):
                td_txt = ''.join(tds[tdi].xpath(".//text()"))
                leader_info[ths[tdi]] = trim_space(td_txt.strip())
            leader_info_list.append(leader_info)
    return leader_info_list
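
# Illustrative return value of parse_leader_info, keyed by the header row
# (placeholders, not actual data):
#   [{'<header 1>': '<cell>', '<header 2>': '<cell>'}, ...]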
def parse_website_info(current_selector):
    """Parse the per-site tabs into a {tab_name: {section: text}} dict."""
    tab_name_xpath = "//div[@class='good-info-box']/div[@class='tab ']/div[@id='tabNav']/p/text()"
    tab_names = current_selector.xpath(tab_name_xpath)
    if len(tab_names) <= 0:
        return {}
    website_info_tab = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab ']/div[@id='tabInfo']/div")
    website_info = {}
    for i in range(len(website_info_tab)):
        tab_dict = {}
        tab_name = tab_names[i]
        tab_content = website_info_tab[i]
        overview_box = tab_content.xpath("./dl[@class='overview box']")
        if overview_box is not None and len(overview_box) > 0:
            overview_info = overview_box[0].xpath(".//p/text()")
            tab_dict["概述"] = trim_space(''.join(overview_info).strip())  # 概述 = "overview"
        other_box = tab_content.xpath("./div")
        if other_box is not None and len(other_box) > 0:
            for box in other_box:
                subtitle_info = box.xpath(".//h2/text()")
                if len(subtitle_info) > 0:
                    # Strip the trailing delimiter from the heading; assumed
                    # to be the full-width colon '：' lost from the source.
                    subtitle = ''.join(subtitle_info).replace("：", "").strip()
                    # Skip 入驻单位 ("resident units") and 主要设施 ("main facilities").
                    if subtitle not in ['入驻单位', '主要设施']:
                        content_info = box.xpath(".//text()")
                        content = ''.join([_.strip() for _ in content_info])
                        tab_dict[subtitle] = trim_space(content)
        website_info[tab_name] = tab_dict
    return website_info
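
# Illustrative return value of parse_website_info (placeholders only):
#   {'<tab name>': {'概述': '<overview text>', '<h2 subtitle>': '<section text>'}, ...}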
def parse_contact_info(current_selector):
    """Parse the contact tab into a {label: value} dict."""
    tab_name_xpath = "//div[@class='good-info-box']/div[@id='tabContact']/div[@class='tabNav nav2 cls']/p/text()"
    tab_names = current_selector.xpath(tab_name_xpath)
    if len(tab_names) <= 0:
        return {}
    contact_info_line = current_selector.xpath(
        "//div[@class='good-info-box']/div[@id='tabContact']/div[@id='tabContactText']/div//p")
    contact_info = {}
    for i in range(len(contact_info_line)):
        contact_item = contact_info_line[i]
        contact_item_text = "".join(contact_item.xpath(".//text()"))
        # "label：value" lines; the delimiter is assumed to be the
        # full-width colon '：' lost from the displayed source.
        contact_item_name = contact_item_text.split("：")[0]
        contact_item_value = "".join(contact_item_text.split("：")[1:])
        contact_info[contact_item_name] = contact_item_value
    return contact_info
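
# Illustrative return value of parse_contact_info (placeholders only):
#   {'<contact label>': '<contact value>', ...}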
if __name__ == '__main__':
    count = 0
    for _path in os.listdir(target_file_path):
        if _path.endswith('.html'):
            # Debug: restrict to a single page, e.g.
            # if _path.endswith('美国联邦调查局FBI_组织结构_全球军事态势情报数据库.html'):
            count += 1
            webpage_info = {}
            with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')
            selector = etree.HTML(html_content)
            title_info = selector.xpath(xpath_parse_dict['title'])
            title = ''.join(title_info).strip()
            # The last dropdown entry links back to the original source page.
            url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
            webpage_info["url_info"] = url_info.replace("#", "")
            base_info = parse_base_info(selector)
            if len(base_info) > 0:
                webpage_info["base_info"] = base_info
            leader_info = parse_leader_info(selector)
            if len(leader_info) > 0:
                webpage_info["leader_info"] = leader_info
            website_info = parse_website_info(selector)
            if len(website_info) > 0:
                webpage_info["website_info"] = website_info
            contact_info = parse_contact_info(selector)
            if len(contact_info) > 0:
                webpage_info["contact_info"] = contact_info
            result_file_name = '%s-' % count + title.replace("/", "-") + '.json'
            result_file_path = 'E:/yuxin/nuofang-data/structure/result0512/' + result_file_name
            with open(result_file_path, 'w', encoding='utf-8') as result_file:
                result_file.write(json.dumps(webpage_info, ensure_ascii=False))
            print("[No. {}] {} written; original url: {}".format(count, title, webpage_info["url_info"]))
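
# Each page produces one JSON file under result0512/, shaped roughly as below
# (keys appear only when the corresponding block parsed non-empty):
#   {"url_info": "<source url>",
#    "base_info": {...}, "leader_info": [...],
#    "website_info": {...}, "contact_info": {...}}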