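"""Parse locally saved organization-detail HTML pages and collect person profile links.

The script walks every ``*.html`` file under ``target_file_path``, extracts page
fragments with the XPath expressions in ``xpath_parse_dict``, and prints the
de-duplicated set of person profile links found in each page's leader table.
"""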
import json
import os
import re

from lxml import etree

# Root directory that holds the locally saved HTML pages to parse.
target_file_path = r"E:\yuxin\nuofang-data\structure\0420\webpage"

# XPath expressions for the page fragments of interest.
xpath_parse_dict = {
    'title': "//div[@class='good-info-box']/div[@class='top cls']/span/text()",
    'base_info': "//div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']/div[@class='layui-row txt']/div[1]/p",
    'leader_info': "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']/div[@id='leader_list']//tbody/tr",
    'website_info': "//div[@class='good-info-box']/div[@class='tab '][2]/div[@class='tabInfo sourcetabInfo']/div",
}


def trim_space(input_str):
    """Collapse runs of two or more spaces into a single space.

    >>> trim_space('a    b  c')
    'a b c'
    """
    result = re.sub(' {2,}', ' ', input_str)
    return result


def parse_person_links(current_selector):
    """Return the href of every link found in the leader-table rows."""
    leader_info_line = current_selector.xpath(xpath_parse_dict['leader_info'])
    m_person_links = []
    for line in leader_info_line:
        line_link_info = line.xpath(".//a/@href")
        if line_link_info:
            m_person_links.append("".join(line_link_info))
    return m_person_links


if __name__ == '__main__':
    count = 0
    person_links = []
    for _path in os.listdir(target_file_path):
        if _path.endswith('.html'):
            # Debug switch: limit the run to a single page, e.g. the FBI organization page:
            # if _path.endswith('美国联邦调查局(FBI)_组织结构_全球军事态势情报数据库.html'):
            count += 1
            webpage_info = {}
            with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')
            selector = etree.HTML(html_content)

            # Page title.
            title_info = selector.xpath(xpath_parse_dict['title'])
            title = ''.join(title_info).strip()

            # The original source URL is the last entry of the drop-down menu.
            url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
            webpage_info["url_info"] = url_info.replace("#", "")

            # Collect person profile links from the leader table.
            person_links += parse_person_links(selector)

    # De-duplicate across all pages and print the result.
    person_links = list(set(person_links))
    for link in person_links:
        print(link)