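"""Parse saved weapon-detail pages (*.html) from a military-equipment
database site into one JSON file per page.

For every HTML file under target_file_path the script extracts the page
title, the source URL, attached image file names, the basic-info fields,
the per-source website tabs, and the "简氏数据" (Janes data) tab, then
writes the collected data to result_folder_path as <title>.json.
"""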
import json
import os
import re

from lxml import etree

# Source folder with the saved HTML pages, and target folder for the JSON output.
target_file_path = r"E:/yuxin/nuofang-data/weapon/webpage"
result_folder_path = r"E:/yuxin/nuofang-data/weapon/result"

def trim_space(input_str):
    """Collapse runs of two or more spaces into a single space."""
    return re.sub(' {2,}', ' ', input_str)


def trim_n_space(input_str):
    """Collapse repeated literal "\\n " markers (as inserted by
    parse_janes_info) and drop a marker that directly precedes a
    full-width colon."""
    for _ in range(5):
        input_str = input_str.replace("\\n \\n ", "\\n ")
    input_str = input_str.replace("\\n :", ":")
    return input_str

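# Derived example (the markers are literal backslash-n sequences):
#   trim_n_space("a\\n \\n b\\n :") returns "a\\n b:"
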
def get_table_lines(table_areas):
    """Flatten the rows of the given <table> elements into a list of
    lists: one list of cell texts per <tr>, with header (<th>) cells and
    data (<td>) cells emitted as separate rows."""
    _contents = []
    for table_area in table_areas:
        for line in table_area.xpath(".//tr"):
            line_th = [trim_space("".join(th.xpath(".//text()"))) for th in line.xpath(".//th")]
            line_td = [trim_space("".join(td.xpath(".//text()"))) for td in line.xpath(".//td")]
            if line_th:
                _contents.append(line_th)
            if line_td:
                _contents.append(line_td)
    return _contents

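# Illustrative only (markup and cell values assumed): a row pair such as
#   <tr><th>名称</th><th>口径</th></tr>
#   <tr><td>Hydra-70</td><td>70 mm</td></tr>
# would come back as [['名称', '口径'], ['Hydra-70', '70 mm']].
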
def parse_base_info(current_selector):
    """Read the first tab (basic info): each <p> holds one "label:value"
    pair separated by a full-width colon; returns them as a dict."""
    base_info_line = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']"
        "/div[@class='cItem active']/div[@class='layui-row txt']/p")
    base_info = {}
    for line in base_info_line:
        line_text = ''.join(line.xpath(".//text()"))
        # Split on the first full-width colon only, so any colons inside
        # the value are preserved.
        parts = line_text.split(':', 1)
        label = parts[0].strip()
        value = parts[1].strip() if len(parts) > 1 else ''
        base_info[label] = trim_space(value)
    return base_info

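# Illustrative only (field names assumed): a <p> such as "产国:美国"
# yields base_info == {'产国': '美国'}.
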
def parse_website_info(current_selector):
    """Parse the per-source tabs in the second tab group into
    {tab_name: {subtitle: text}}; the "简氏数据" (Janes data) tab is
    skipped here and handled by parse_janes_info."""
    tab_name_xpath = (".//div[@class='good-info-box']/div[@class='tab '][2]"
                      "/div[@class='tabNav nav2 cls sourcetabNav']/p/text()")
    # Bug fix: this originally read the global `selector` instead of the
    # `current_selector` argument.
    tab_names = current_selector.xpath(tab_name_xpath)
    if len(tab_names) <= 0:
        # Return an empty dict rather than None so callers can call len().
        return {}
    website_info_tab = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab '][2]"
        "/div[@class='tabInfo sourcetabInfo']/div")
    website_info = {}
    # zip pairs each tab label with its body and guards a length mismatch.
    for tab_name, tab_content in zip(tab_names, website_info_tab):
        if '简氏数据' in tab_name:
            continue
        tab_dict = {}
        # Overview paragraph, stored under the "概述" (overview) key.
        overview_box = tab_content.xpath("./dl[@class='overview box']")
        if len(overview_box) > 0:
            overview_info = overview_box[0].xpath(".//p/text()")
            tab_dict["概述"] = trim_space(''.join(overview_info).strip())
        # Remaining boxes: an <h2> subtitle followed by free text.
        for box in tab_content.xpath("./div"):
            subtitle_info = box.xpath(".//h2/text()")
            if len(subtitle_info) <= 0:
                continue
            subtitle = ''.join(subtitle_info).replace(":", "").strip()
            if subtitle in ['入驻单位', '主要设施']:
                continue
            _content_info = []
            for _ in box.xpath(".//text()"):
                _ = trim_space(_)
                _ = _.replace(" ", "").replace("\xa0", "").replace('"', '').strip()
                if len(_) <= 0:
                    continue
                if _ == ":" and _content_info:
                    # A lone full-width colon is glued onto the previous line.
                    _content_info[-1] = "\n" + _content_info[-1] + ":"
                else:
                    _content_info.append(' ' + _)
            tab_dict[subtitle] = '\n'.join(_content_info).strip()
        website_info[tab_name] = tab_dict
    return website_info

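# Illustrative only (tab and subtitle names assumed): the returned mapping
# has the shape {'<tab name>': {'概述': '...', '<h2 subtitle>': '...'}}.
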
def parse_janes_info(current_selector):
    """Read the "简氏数据" (Janes data) tab: an overview paragraph plus a
    series of titled boxes holding either tables or plain text."""
    tab_xpath = "//div[@class='good-info-box']/div[@class='tab '][2]"
    list_tabs = current_selector.xpath(tab_xpath)
    janes_info = {}
    for current_tab in list_tabs:
        tab_name = ''.join(current_tab.xpath(".//p[@class='item active']/text()")).strip()
        if '简氏数据' not in tab_name:
            continue
        overview_info = current_tab.xpath(
            ".//dl[@class='overview box']/dd[@class='txt']/div[@class='changlanguage']/p//text()")
        janes_info['概述'] = trim_space(''.join(overview_info))
        table_boxes = current_tab.xpath(".//div[@class='cItem active']/div[@class='box']")
        for box in table_boxes:
            box_title = "".join(box.xpath("./h2/text()")).replace(":", "").strip()
            if len(box_title) <= 0:
                continue
            bd_div_area = box.xpath("./div[@class='bd']")
            table2_area = box.xpath("./table")
            if len(table2_area) > 0:
                # A bare table directly under the box.
                janes_info[box_title] = get_table_lines(table2_area)
            elif len(bd_div_area) > 0:
                # One or more captioned sub-tables inside div.bd.
                for bd_div_info in bd_div_area:
                    table_title_spans = bd_div_info.xpath("./span[@class='tabletitle']")
                    table_content_areas = bd_div_info.xpath("./table")
                    data_info_item = {}
                    # Only parse when every caption has exactly one table.
                    if table_title_spans and len(table_title_spans) == len(table_content_areas):
                        for title_span, content_area in zip(table_title_spans, table_content_areas):
                            table_title_text = "".join(title_span.xpath(".//text()")).strip()
                            data_info_item[table_title_text] = get_table_lines([content_area])
                        janes_info[box_title] = data_info_item
            else:
                # Plain-text box: join the text nodes with a literal "\n "
                # marker, then let trim_n_space tidy the result.
                content_text = "\\n ".join([trim_space(str(_)).strip() for _ in box.xpath(".//text()")])
                janes_info[box_title] = trim_n_space(content_text)
    return janes_info

if __name__ == '__main__':
    count = 0
    for _path in os.listdir(target_file_path):
        if not _path.endswith('.html'):
            continue
        # if _path == '“九头蛇-70” 火箭系统_火箭_鱼雷_全球军事态势情报数据库.html':
        count += 1
        webpage_info = {}
        # Newlines are stripped so extracted text nodes stay on one line.
        with open(target_file_path + '/' + _path, mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)

        title_info = selector.xpath("//div[@class='good-info-box']/div[@class='top cls']/span/text()")
        title = ''.join(title_info).strip()

        url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
        webpage_info["url_info"] = url_info.replace("#", "")

        base_info = parse_base_info(selector)
        website_info = parse_website_info(selector)
        janes_info = parse_janes_info(selector)

        # Keep only the file name portion of each attached image link.
        image_info_raw = selector.xpath(
            "//div[@id='det_right']/div[@class='sub-box sub-datas']/ul/li/a[@class='img']/@href")
        image_info = []
        for _ in image_info_raw:
            print(_)
            image_info.append(_.split("/")[-1])
        webpage_info["image_info"] = image_info

        if len(base_info) > 0:
            webpage_info["base_info"] = base_info
        if len(website_info) > 0:
            webpage_info["website_info"] = website_info
        if len(janes_info) > 0:
            webpage_info["janes_info"] = janes_info

        # Replace characters that are illegal in Windows file names.
        result_file_name = title + '.json'
        for _ in ['"', '/', '\\', '|', ':', '?', '<', '>', '*']:
            result_file_name = result_file_name.replace(_, "_")
        result_file_path = "{}/{}".format(result_folder_path, result_file_name)
        with open(result_file_path, 'w', encoding='utf-8') as result_file:
            result_file.write(json.dumps(webpage_info, ensure_ascii=False))
        print("[No. {}] {} written; source url: {}".format(count, title, webpage_info["url_info"]))
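
# Illustrative output shape (keys from the code above, values are placeholders):
#   {
#       "url_info": "http://.../detail...",
#       "image_info": ["xxxx.jpg"],
#       "base_info": {"<label>": "<value>"},
#       "website_info": {"<tab name>": {"概述": "...", "<subtitle>": "..."}},
#       "janes_info": {"概述": "...", "<box title>": [["<th text>"], ["<td text>"]]}
#   }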