# Parse saved organisation webpages (HTML) into structured JSON files.
import os
from lxml import etree
import json
import re
# Directory containing the saved HTML pages to be parsed.
target_file_path = r"E:\yuxin\nuofang-data\structure\0420\webpage"

# XPath expressions used to locate each section of interest on a page.
xpath_parse_dict = {
    'title': "//div[@class='good-info-box']/div[@class='top cls']/span/text()",
    'base_info': "//div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']/div[@class='layui-row txt']/div[1]/p",
    'leader_info': "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']/div[@id='leader_list']//tbody/tr",
    'website_info': "//div[@class='good-info-box']/div[@class='tab '][2]/div[@class='tabInfo sourcetabInfo']/div",
}
def trim_space(input_str):
    """Collapse every run of two or more spaces in *input_str* to one space."""
    return re.sub(' {2,}', ' ', input_str)
def parse_base_info(current_selector):
    """Parse the "base info" section of a page into a dict.

    Each matched <p> line is expected to look like "<label>:<value>"
    (full-width colon).  Returns {label: value} with runs of spaces in
    the value collapsed; a line without a colon maps its whole text to
    an empty value.
    """
    base_info = {}
    for line in current_selector.xpath(xpath_parse_dict['base_info']):
        line_text = ''.join(line.xpath(".//text()"))
        # partition() keeps any further ':' characters inside the value;
        # the previous split+join silently dropped them (data loss).
        label, _, value = line_text.partition(':')
        base_info[label.strip()] = trim_space(value.strip())
    return base_info
def parse_leader_info(current_selector):
    """Parse the leader table into a list of {header: cell_text} dicts.

    The first <tr> supplies the column headers (<th>); every following
    <tr> becomes one dict, pre-filled with '' for each header.  Headers
    and cells are paired with zip(), so a row with more <td> cells than
    headers no longer raises IndexError — extra cells are ignored.
    """
    rows = current_selector.xpath(xpath_parse_dict['leader_info'])
    if not rows:
        return []
    headers = rows[0].xpath("./th/text()")
    leader_info_list = []
    for row in rows[1:]:
        leader_info = dict.fromkeys(headers, '')
        for header, cell in zip(headers, row.xpath("./td")):
            cell_text = ''.join(cell.xpath(".//text()"))
            leader_info[header] = trim_space(cell_text.strip())
        leader_info_list.append(leader_info)
    return leader_info_list
def parse_website_info(current_selector):
    """Parse the per-tab "website" section into {tab_name: {subtitle: text}}.

    Returns an empty dict when the page has no tab navigation.  Tab names
    and tab content divs are paired with zip(), so a count mismatch
    between the two node lists no longer raises IndexError.
    """
    tab_name_xpath = "//div[@class='good-info-box']/div[@class='tab ']/div[@id='tabNav']/p/text()"
    tab_names = current_selector.xpath(tab_name_xpath)
    if not tab_names:
        # NOTE: was `return []`; {} keeps the return type consistent with
        # the non-empty case (callers only test len(), so this is safe).
        return {}
    tabs = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab ']/div[@id='tabInfo']/div")
    website_info = {}
    for tab_name, tab_content in zip(tab_names, tabs):
        tab_dict = {}
        overview_box = tab_content.xpath("./dl[@class='overview box']")
        if overview_box:  # xpath() returns a list, never None
            overview_info = overview_box[0].xpath(".//p/text()")
            tab_dict["概述"] = trim_space(''.join(overview_info).strip())
        for box in tab_content.xpath("./div"):
            subtitle_info = box.xpath(".//h2/text()")
            if not subtitle_info:
                continue
            subtitle = ''.join(subtitle_info).replace(":", "").strip()
            # These two sections hold nested structures that this flat
            # text extraction does not handle — skip them.
            if subtitle in ('入驻单位', '主要设施'):
                continue
            content = ''.join(_.strip() for _ in box.xpath(".//text()"))
            tab_dict[subtitle] = trim_space(content)
        website_info[tab_name] = tab_dict
    return website_info
def parse_contact_info(current_selector):
    """Parse the contact tab into a {name: value} dict.

    Each matched <p> looks like "<name>:<value>" (full-width colon).
    partition() keeps any further ':' characters inside the value; the
    previous split+join silently dropped them.  Returns an empty dict
    when the contact tab is absent (was []; {} keeps the return type
    consistent and callers only test len()).
    """
    tab_name_xpath = "//div[@class='good-info-box']/div[@id='tabContact']/div[@class='tabNav nav2 cls']/p/text()"
    if not current_selector.xpath(tab_name_xpath):
        return {}
    contact_lines = current_selector.xpath(
        "//div[@class='good-info-box']/div[@id='tabContact']/div[@id='tabContactText']/div//p")
    contact_info = {}
    for contact_item in contact_lines:
        contact_item_text = "".join(contact_item.xpath(".//text()"))
        name, _, value = contact_item_text.partition(":")
        contact_info[name] = value
    return contact_info
if __name__ == '__main__':
    result_dir = 'E:/yuxin/nuofang-data/structure/result0512'
    count = 0
    for _path in os.listdir(target_file_path):
        # Guard clause: only saved HTML pages are of interest.
        if not _path.endswith('.html'):
            continue
        count += 1
        webpage_info = {}
        # with-statement guarantees the handle is closed even on error
        # (the original open/close pair leaked on exceptions).
        with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)
        title = ''.join(selector.xpath(xpath_parse_dict['title'])).strip()

        # The last dropdown entry holds the original source URL; guard the
        # [0] index so a page without the menu no longer raises IndexError.
        url_nodes = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')
        webpage_info["url_info"] = url_nodes[0].replace("#", "") if url_nodes else ''

        base_info = parse_base_info(selector)
        if base_info:
            webpage_info["base_info"] = base_info
        leader_info = parse_leader_info(selector)
        if leader_info:
            webpage_info["leader_info"] = leader_info
        website_info = parse_website_info(selector)
        if website_info:
            webpage_info["website_info"] = website_info
        contact_info = parse_contact_info(selector)
        if contact_info:
            webpage_info["contact_info"] = contact_info

        # '/' in a title would otherwise be treated as a path separator.
        result_file_name = '%s-' % count + title.replace("/", "-") + '.json'
        result_file_path = result_dir + '/' + result_file_name
        with open(result_file_path, 'w', encoding='utf-8') as result_file:
            result_file.write(json.dumps(webpage_info, ensure_ascii=False))
        print("[No. {}] {} 写入完成,原始url是{}".format(count, title, webpage_info["url_info"]))