osc/research/nuofang-db/nfm/mperson/parse_person_link.py
2025-05-28 19:16:17 +08:00

52 lines
2.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
from lxml import etree
import json
import re
# Directory that is scanned for saved ``.html`` pages.
target_file_path = r"E:\yuxin\nuofang-data\structure\0420\webpage"

# XPath expressions keyed by the page section they select.
xpath_parse_dict = {
    # Text nodes of the span inside the "top cls" header of the info box.
    "title": "//div[@class='good-info-box']/div[@class='top cls']/span/text()",
    # <p> elements in the first column of the active item of the first tab.
    "base_info": (
        "//div[@class='tab '][1]/div[@id='tab1_info']"
        "/div[@class='cItem active']/div[@class='layui-row txt']/div[1]/p"
    ),
    # Table rows (<tr>) under the "leader_list" container of the first tab.
    "leader_info": (
        "//div[@class='good-info-box']/div[@class='tab '][1]"
        "/div[@id='tab1_info']/div[@class='cItem active']"
        "/div[@id='leader_list']//tbody/tr"
    ),
    # Child <div> elements of the source panel in the second tab.
    "website_info": (
        "//div[@class='good-info-box']/div[@class='tab '][2]"
        "/div[@class='tabInfo sourcetabInfo']/div"
    ),
}
def trim_space(input_str):
    """Collapse every run of two or more spaces in ``input_str`` into one.

    Only the space character is matched: tabs, newlines and isolated single
    spaces are left untouched, and leading/trailing spaces are not stripped.

    Args:
        input_str: Text to normalize.

    Returns:
        The text with each multi-space run replaced by a single space.
    """
    multi_space = re.compile(' {2,}')
    return multi_space.sub(' ', input_str)
def parse_person_links(current_selector):
    """Collect the link targets found in the leader table of a parsed page.

    Rows are located with ``xpath_parse_dict['leader_info']``; within each
    row, the ``href`` of every descendant ``<a>`` element is collected.

    Args:
        current_selector: Parsed document exposing an ``xpath()`` method
            (the script passes the result of ``etree.HTML``).

    Returns:
        A list of ``href`` strings in document order. Duplicates are kept;
        rows without any anchor contribute nothing.
    """
    m_person_links = []
    for row in current_selector.xpath(xpath_parse_dict['leader_info']):
        # One entry per anchor: joining several hrefs from the same row into
        # a single string would yield something that is not a usable link.
        # str() stores a plain string rather than the XPath result object.
        m_person_links.extend(str(href) for href in row.xpath(".//a/@href"))
    return m_person_links
if __name__ == '__main__':
    # Scan every saved .html page in the target directory, gather the leader
    # links each one contains, then print the distinct links one per line.
    person_links = []
    for _path in os.listdir(target_file_path):
        # To debug a single page, tighten this check to that page's full
        # file name instead of the generic '.html' suffix.
        if not _path.endswith('.html'):
            continue
        # The context manager closes the file even if reading/decoding fails.
        with open(os.path.join(target_file_path, _path),
                  mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)
        person_links += parse_person_links(selector)
    # Deduplicate across pages; sorting keeps the output order reproducible
    # between runs (plain set iteration order is not).
    person_links = sorted(set(person_links))
    for link in person_links:
        print(link)