"""Extract weapon-entry data from saved HTML pages into per-entry JSON files.

For each .html file under target_file_path, parse the page title, source URL,
image file names, the basic-info tab, the per-source website tabs, and the
Janes ('简氏数据') tab, then write the result as <title>.json into
result_folder_path.
"""
import json
import os
import re

from lxml import etree

target_file_path = r"E:/yuxin/nuofang-data/weapon/webpage"
result_folder_path = r"E:/yuxin/nuofang-data/weapon/result"


def trim_space(input_str):
    """Collapse runs of two or more spaces into a single space."""
    return re.sub(' {2,}', ' ', input_str)


def trim_n_space(input_str):
    """Collapse repeated blank lines and pull stray colons back onto the previous line."""
    for _ in range(5):
        input_str = input_str.replace("\n \n ", "\n ")
    input_str = input_str.replace("\n :", ":")
    return input_str


def get_table_lines(table_areas):
    """Flatten every <tr> of the given table nodes into a list of cell-text lists."""
    _contents = []
    for table_area in table_areas:
        for line in table_area.xpath(".//tr"):
            line_th = []
            line_td = []
            for th in line.xpath(".//th"):
                line_th.append(trim_space("".join(th.xpath(".//text()"))))
            for td in line.xpath(".//td"):
                line_td.append(trim_space("".join(td.xpath(".//text()"))))
            if len(line_th) > 0:
                _contents.append(line_th)
            if len(line_td) > 0:
                _contents.append(line_td)
    return _contents


def parse_base_info(current_selector):
    """Parse the "label: value" paragraphs of the first info tab into a dict."""
    base_info_line = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']"
        "/div[@class='cItem active']/div[@class='layui-row txt']/p")
    base_info = {}
    for line in base_info_line:
        line_text = ''.join(line.xpath(".//text()"))
        # split on the first colon only, so colons inside the value survive
        label, _, value = line_text.partition(':')
        base_info[label.strip()] = trim_space(value.strip())
    return base_info


def parse_website_info(current_selector):
    """Parse the per-source tabs of the second tab block, skipping the Janes tab."""
    tab_names = current_selector.xpath(
        ".//div[@class='good-info-box']/div[@class='tab '][2]"
        "/div[@class='tabNav nav2 cls sourcetabNav']/p/text()")
    if len(tab_names) <= 0:
        return {}
    website_info_tab = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab '][2]"
        "/div[@class='tabInfo sourcetabInfo']/div")
    website_info = {}
    # zip pairs each tab label with its content pane (and tolerates a length mismatch)
    for tab_name, tab_content in zip(tab_names, website_info_tab):
        if '简氏数据' in tab_name:  # the Janes tab is handled by parse_janes_info()
            continue
        tab_dict = {}
        overview_box = tab_content.xpath("./dl[@class='overview box']")
        if len(overview_box) > 0:
            overview_info = overview_box[0].xpath(".//p/text()")
            tab_dict["概述"] = trim_space(''.join(overview_info).strip())
        for box in tab_content.xpath("./div"):
            subtitle_info = box.xpath(".//h2/text()")
            if len(subtitle_info) <= 0:
                continue
            subtitle = ''.join(subtitle_info).replace(":", "").strip()
            if subtitle in ['入驻单位', '主要设施']:
                continue
            _content_info = []
            for text in box.xpath(".//text()"):
                text = trim_space(text)
                text = text.replace(" ", "").replace("\xa0", "").replace('"', '').strip()
                if len(text) > 0:
                    if text == ":":
                        # a lone colon belongs to the previous fragment
                        _content_info[-1] = "\n" + _content_info[-1] + ":"
                    else:
                        _content_info.append('        ' + text)
            tab_dict[subtitle] = '\n'.join(_content_info).strip()
        website_info[tab_name] = tab_dict
    return website_info


def parse_janes_info(current_selector):
    """Parse the Janes ('简氏数据') tab: overview text plus titled data tables."""
    list_tabs = current_selector.xpath("//div[@class='good-info-box']/div[@class='tab '][2]")
    janes_info = {}
    for current_tab in list_tabs:
        tab_name = ''.join(current_tab.xpath(".//p[@class='item active']/text()")).strip()
        if '简氏数据' not in tab_name:
            continue
        overview_info = current_tab.xpath(
            ".//dl[@class='overview box']/dd[@class='txt']/div[@class='changlanguage']/p//text()")
        janes_info['概述'] = trim_space(''.join(overview_info))
        for box in current_tab.xpath(".//div[@class='cItem active']/div[@class='box']"):
            box_title = "".join(box.xpath("./h2/text()")).replace(":", "").strip()
            if len(box_title) <= 0:
                continue
            bd_div_area = box.xpath("./div[@class='bd']")
            table2_area = box.xpath("./table")
            if len(table2_area) > 0:
                # a bare table directly under the box
                janes_info[box_title] = get_table_lines(table2_area)
            elif len(bd_div_area) > 0:
                # titled sub-tables: each 'tabletitle' span precedes its own table
                for bd_div_info in bd_div_area:
                    table_title_spans = bd_div_info.xpath("./span[@class='tabletitle']")
                    table_content_areas = bd_div_info.xpath("./table")
                    data_info_item = {}
                    if table_title_spans and len(table_title_spans) == len(table_content_areas):
                        for title_span, content_area in zip(table_title_spans, table_content_areas):
                            table_title_text = "".join(title_span.xpath(".//text()")).strip()
                            data_info_item[table_title_text] = get_table_lines([content_area])
                        janes_info[box_title] = data_info_item
            else:
                # plain text box: join the fragments and squeeze blank lines
                content_text = "\n ".join(trim_space(str(t)).strip() for t in box.xpath(".//text()"))
                janes_info[box_title] = trim_n_space(content_text)
    return janes_info


if __name__ == '__main__':
    count = 0
    for _path in os.listdir(target_file_path):
        if not _path.endswith('.html'):
            continue
        count += 1
        webpage_info = {}
        with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)
        title_info = selector.xpath("//div[@class='good-info-box']/div[@class='top cls']/span/text()")
        title = ''.join(title_info).strip()
        # the last entry of the drop menu links back to the original page
        url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
        webpage_info["url_info"] = url_info.replace("#", "")
        base_info = parse_base_info(selector)
        website_info = parse_website_info(selector)
        janes_info = parse_janes_info(selector)
        image_info_raw = selector.xpath(
            "//div[@id='det_right']/div[@class='sub-box sub-datas']/ul/li/a[@class='img']/@href")
        # keep only the file name of each image link
        webpage_info["image_info"] = [href.split("/")[-1] for href in image_info_raw]
        if len(base_info) > 0:
            webpage_info["base_info"] = base_info
        if len(website_info) > 0:
            webpage_info["website_info"] = website_info
        if len(janes_info) > 0:
            webpage_info["janes_info"] = janes_info
        # sanitize characters that are invalid in Windows file names
        result_file_name = title + '.json'
        for ch in ['"', '/', '\\', '|', ':', '?', '<', '>', '*']:
            result_file_name = result_file_name.replace(ch, "_")
        result_file_path = os.path.join(result_folder_path, result_file_name)
        with open(result_file_path, 'w', encoding='utf-8') as result_file:
            result_file.write(json.dumps(webpage_info, ensure_ascii=False))
        print("[No. {}] {} written, original url: {}".format(count, title, webpage_info["url_info"]))
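
# Illustrative output shape (hypothetical values; apart from the top-level keys
# and '概述', the field names below are examples, not taken from a real page):
#
# {
#   "url_info": "https://example.com/weapon/123",
#   "image_info": ["123_1.jpg"],
#   "base_info": {"名称": "..."},
#   "website_info": {"来源站点": {"概述": "...", "研发历程": "..."}},
#   "janes_info": {"概述": "...", "规格": [["长度", "10 m"]]}
# }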