osc/research/nuofang-db/nfm/mweapon/page_parser.py

import os
import json
import re

from lxml import etree

# Machine-specific input/output locations: saved HTML pages go in,
# one JSON file per page comes out.
target_file_path = r"E:/yuxin/nuofang-data/weapon/webpage"
result_folder_path = r"E:/yuxin/nuofang-data/weapon/result"

def trim_space(input_str):
    # Collapse runs of two or more spaces into a single space.
    return re.sub(' {2,}', ' ', input_str)
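
# Behavior check for trim_space (input is hypothetical, doctest-style):
#   >>> trim_space("M16   assault  rifle")
#   'M16 assault rifle'
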
def trim_n_space(input_str):
    # The joiner used in parse_janes_info is the literal two-character
    # sequence backslash + 'n' followed by a space, not a real newline.
    # Collapse runs of the marker (up to five passes), then drop it.
    for i in range(5):
        input_str = input_str.replace("\\n \\n ", "\\n ")
    input_str = input_str.replace("\\n ", "")
    return input_str
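
# Behavior check for trim_n_space (hypothetical input; the escaped
# backslashes match the literal marker used in the code as published):
#   >>> trim_n_space("foo\\n \\n bar")
#   'foobar'
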
def get_table_lines(table_areas):
    # Flatten every <tr> of every given table into lists of cell texts.
    # Header cells (<th>) and data cells (<td>) each form their own row.
    _contents = []
    for table_area in table_areas:
        table_lines = table_area.xpath(".//tr")
        for line in table_lines:
            th_list = line.xpath(".//th")
            td_list = line.xpath(".//td")
            line_th = []
            line_td = []
            for th in th_list:
                th_text = trim_space("".join(th.xpath(".//text()")))
                line_th.append(th_text)
            for td in td_list:
                td_text = trim_space("".join(td.xpath(".//text()")))
                line_td.append(td_text)
            if len(line_th) > 0:
                _contents.append(line_th)
            if len(line_td) > 0:
                _contents.append(line_td)
    return _contents
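
# Shape check for get_table_lines on a hypothetical two-row table:
#   >>> demo = etree.HTML("<table><tr><th>A</th><th>B</th></tr>"
#   ...                   "<tr><td>1</td><td>2</td></tr></table>")
#   >>> get_table_lines(demo.xpath("//table"))
#   [['A', 'B'], ['1', '2']]
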
def parse_base_info(current_selector):
    # Each <p> in the base-info panel holds a "label<sep>value" pair.
    # The published file split on a character that is invisible in the
    # hosted view; the full-width colon '：' used on Chinese pages is an
    # assumption.
    label_separator = '：'
    base_info_line = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']/div[@class='layui-row txt']/p")
    base_info = {}
    for line in base_info_line:
        line_text = ''.join(line.xpath(".//text()"))
        label = line_text.split(label_separator)[0].strip()
        value = ''.join(line_text.split(label_separator)[1:]).strip()
        base_info[label] = trim_space(value)
    return base_info
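
# Behavior check for parse_base_info, assuming the '：' separator above:
# a paragraph whose text is '产地：美国' ("origin: USA") would yield
#   {'产地': '美国'}
# (the pair is illustrative, not taken from the data set).
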
def parse_website_info(current_selector):
    # Parse the per-source tabs of the second tab block; the Jane's data
    # tab ('简氏数据') is skipped here and handled by parse_janes_info.
    tab_name_xpath = ".//div[@class='good-info-box']/div[@class='tab '][2]/div[@class='tabNav nav2 cls sourcetabNav']/p/text()"
    tab_names = current_selector.xpath(tab_name_xpath)
    if len(tab_names) <= 0:
        return {}
    website_info_tab = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab '][2]/div[@class='tabInfo sourcetabInfo']/div")
    website_info = {}
    # Assumes one nav entry per tab pane, in document order.
    for i in range(len(website_info_tab)):
        tab_dict = {}
        tab_name = tab_names[i]
        if '简氏数据' in tab_name:  # "Jane's data"
            continue
        tab_content = website_info_tab[i]
        overview_box = tab_content.xpath("./dl[@class='overview box']")
        if len(overview_box) > 0:
            overview_info = overview_box[0].xpath(".//p/text()")
            tab_dict["概述"] = trim_space(''.join(overview_info).strip())  # "overview"
        other_box = tab_content.xpath("./div")
        for box in other_box:
            subtitle_info = box.xpath(".//h2/text()")
            if len(subtitle_info) > 0:
                # The stripped character is invisible in the hosted view;
                # the full-width colon '：' is an assumption.
                subtitle = ''.join(subtitle_info).replace("：", "").strip()
                if subtitle not in ['入驻单位', '主要设施']:  # "resident units", "main facilities"
                    content_info = box.xpath(".//text()")
                    _content_info = []
                    for _ in content_info:
                        _ = trim_space(_)
                        # The first character stripped here is invisible in the
                        # hosted view; '\u3000' (ideographic space) is an assumption.
                        _ = _.replace("\u3000", "").replace("\xa0", "").replace('"', '').strip()
                        if len(_) > 0:
                            # A fragment consisting only of the separator marks
                            # the previous fragment as a sub-heading; '：' is
                            # again an assumption for the invisible original.
                            if _ == "：":
                                _content_info[-1] = "\n" + _content_info[-1] + "："
                            else:
                                _ = '        ' + _  # eight-space paragraph indent
                                _content_info.append(_)
                    content = '\n'.join(_content_info).strip()
                    tab_dict[subtitle] = content
        website_info[tab_name] = tab_dict
    return website_info
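
# Shape check for parse_website_info: the result maps each source-tab
# name to a dict of section texts:
#   {tab_name: {'概述': overview_text, subtitle: indented_paragraphs, ...}}
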
def parse_janes_info(current_selector):
    # Parse the Jane's data ('简氏数据') tab: an overview paragraph plus
    # titled boxes holding a bare table, captioned tables, or free text.
    tab_xpath = "//div[@class='good-info-box']/div[@class='tab '][2]"
    list_tabs = current_selector.xpath(tab_xpath)
    janes_info = {}
    for current_tab in list_tabs:
        tab_name = ''.join(current_tab.xpath(".//p[@class='item active']/text()")).strip()
        if '简氏数据' in tab_name:
            overview_info = current_tab.xpath(
                ".//dl[@class='overview box']/dd[@class='txt']/div[@class='changlanguage']/p//text()")
            overview_text = trim_space(''.join(overview_info))
            janes_info['概述'] = overview_text  # "overview"
            table_boxes = current_tab.xpath(".//div[@class='cItem active']/div[@class='box']")
            for box in table_boxes:
                # The stripped character is invisible in the hosted view;
                # the full-width colon '：' is an assumption.
                box_title = "".join(box.xpath("./h2/text()")).replace("：", "").strip()
                if len(box_title) <= 0:
                    continue
                bd_div_area = box.xpath("./div[@class='bd']")
                table2_area = box.xpath("./table")
                if len(table2_area) > 0:
                    # Case 1: the box holds a bare table.
                    table2_lines_contents = get_table_lines(table2_area)
                    janes_info[box_title] = table2_lines_contents
                elif len(bd_div_area) > 0:
                    # Case 2: captioned tables, one span.tabletitle per table.
                    for bd_div_info in bd_div_area:
                        table_title_spans = bd_div_info.xpath("./span[@class='tabletitle']")
                        table_content_areas = bd_div_info.xpath("./table")
                        data_info_item = {}
                        if len(table_title_spans) * len(table_content_areas) > 0 \
                                and len(table_title_spans) == len(table_content_areas):
                            for i in range(len(table_title_spans)):
                                table_title_span = table_title_spans[i]
                                table_content_area = table_content_areas[i]
                                table_title_text = "".join(table_title_span.xpath(".//text()")).strip()
                                table_contents = get_table_lines([table_content_area])
                                data_info_item[table_title_text] = table_contents
                            janes_info[box_title] = data_info_item
                else:
                    # Case 3: free text; join fragments with the literal
                    # "\n " marker that trim_n_space later collapses.
                    content_text = "\\n ".join([trim_space(str(_)).strip() for _ in box.xpath(".//text()")])
                    janes_info[box_title] = trim_n_space(content_text)
    return janes_info
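
# Shape check for parse_janes_info, matching the three branches above:
#   {'概述': str,                              # overview paragraph
#    box_title: [[cell, ...], ...]             # case 1: bare table rows
#             | {caption: [[cell, ...], ...]}  # case 2: captioned tables
#             | str}                           # case 3: free text
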
if __name__ == '__main__':
    count = 0
    for _path in os.listdir(target_file_path):
        if _path.endswith('.html'):
            # if _path == '“九头蛇-70” 火箭系统_火箭_鱼雷_全球军事态势情报数据库.html':
            count += 1
            webpage_info = {}
            with open(target_file_path + '/' + _path, mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')
            selector = etree.HTML(html_content)
            title_info = selector.xpath("//div[@class='good-info-box']/div[@class='top cls']/span/text()")
            title = ''.join(title_info).strip()
            url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
            webpage_info["url_info"] = url_info.replace("#", "")
            base_info = parse_base_info(selector)
            website_info = parse_website_info(selector)
            janes_info = parse_janes_info(selector)
            image_info_raw = selector.xpath(
                "//div[@id='det_right']/div[@class='sub-box sub-datas']/ul/li/a[@class='img']/@href")
            image_info = []
            for _ in image_info_raw:
                print(_)  # debug: log each image href
                image_info.append(_.split("/")[-1])  # keep only the file name
            webpage_info["image_info"] = image_info
            if len(base_info) > 0:
                webpage_info["base_info"] = base_info
            if len(website_info) > 0:
                webpage_info["website_info"] = website_info
            if len(janes_info) > 0:
                webpage_info["janes_info"] = janes_info
            # Build a file-system-safe JSON file name from the page title.
            result_file_name = title + '.json'
            for _ in ['"', '/', '\\', '|', ':', '?', '<', '>', '*']:
                result_file_name = result_file_name.replace(_, "_")
            result_file_path = "{}/{}".format(result_folder_path, result_file_name)
            with open(result_file_path, 'w', encoding='utf-8') as result_file:
                result_file.write(json.dumps(webpage_info, ensure_ascii=False))
            print("[No. {}] {} written; source url: {}".format(count, title, webpage_info["url_info"]))