import json
import os
import re

from lxml import etree

target_file_path = r"E:\yuxin\nuofang-data\base\webpage"

xpath_parse_dict = {
    'title': "//div[@class='good-info-box']/div[@class='top cls']/span/text()",
    'base_info': "//div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']"
                 "/div[@class='layui-row txt']/div[1]/p",
    'leader_info': "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']"
                   "/div[@class='cItem active']/div[@id='leader_list']//tbody/tr",
    'website_info': "//div[@class='good-info-box']/div[@class='tab '][2]"
                    "/div[@class='tabInfo sourcetabInfo']/div",
}


def trim_space(input_str):
    """Collapse runs of two or more spaces into a single space."""
    return re.sub(' {2,}', ' ', input_str)


def parse_base_info(current_selector, current_title):
    """Parse the "label:value" rows of the basic-info tab into a dict."""
    base_info_line = current_selector.xpath(xpath_parse_dict['base_info'])
    base_info = {}
    for line in base_info_line:
        line_text = ''.join(line.xpath(".//text()"))
        # Split on the fullwidth colon; rejoin the tail with the same colon so
        # colons inside the value are preserved (a plain ''.join dropped them).
        label = line_text.split(':')[0].strip()
        value = ':'.join(line_text.split(':')[1:]).strip()
        base_info[label] = trim_space(value)
    # if current_title != '' and len(base_info) > 0:
    #     print("[No. {}] {}: {}".format(count, current_title, json.dumps(base_info, ensure_ascii=False)))
    # else:
    #     print("[No. {}] {} parse failed".format(count, _path))
    return base_info


def parse_leader_info(current_selector, current_title):
    """Parse the leader table: row 0 holds the headers, the rest the data."""
    leader_info_line = current_selector.xpath(xpath_parse_dict['leader_info'])
    leader_info_list = []
    ths = []
    for i in range(len(leader_info_line)):
        leader_info = {}
        line = leader_info_line[i]
        if i == 0:
            ths = line.xpath("./th/text()")
        else:
            for th in ths:
                leader_info[th] = ''
            tds = line.xpath("./td")
            # Ignore any cells beyond the header count to avoid an IndexError.
            for tdi in range(min(len(tds), len(ths))):
                td_txt = ''.join(tds[tdi].xpath(".//text()"))
                leader_info[ths[tdi]] = trim_space(td_txt.strip())
            leader_info_list.append(leader_info)
    # if current_title != '' and len(leader_info_list) > 0:
    #     print("[No. {}] {}: {}".format(count, current_title, json.dumps(leader_info_list, ensure_ascii=False)))
    return leader_info_list


def parse_website_info(current_selector, current_title):
    """Parse the second tab group: one dict of titled sub-sections per sub-tab."""
    tab_name_xpath = (".//div[@class='good-info-box']/div[@class='tab '][2]"
                      "/div[@class='tabNav nav2 cls sourcetabNav']/p/text()")
    # Was querying the module-level `selector` by mistake; use the argument.
    tab_names = current_selector.xpath(tab_name_xpath)
    if len(tab_names) <= 0:
        return {}  # empty dict, so the caller's len() check stays safe
    website_info_tab = current_selector.xpath(xpath_parse_dict['website_info'])
    website_info = {}
    for i in range(len(website_info_tab)):
        if i >= len(tab_names):
            break  # more content panes than tab labels; stop pairing
        tab_dict = {}
        tab_name = tab_names[i]
        tab_content = website_info_tab[i]
        overview_box = tab_content.xpath("./dl[@class='overview box']")
        if len(overview_box) > 0:
            overview_info = overview_box[0].xpath(".//p/text()")
            tab_dict["概述"] = trim_space(''.join(overview_info).strip())
        other_box = tab_content.xpath("./div")
        for box in other_box:
            subtitle_info = box.xpath(".//h2/text()")
            if len(subtitle_info) > 0:
                subtitle = ''.join(subtitle_info).replace(":", "").strip()
                # 入驻单位 / 主要设施 are handled separately by
                # parse_unit_info / parse_facility_info.
                if subtitle not in ['入驻单位', '主要设施']:
                    content_info = box.xpath(".//text()")
                    _content_info = []
                    for _ in content_info:
                        _ = trim_space(_)
                        _ = _.replace(" ", "").replace("\xa0", "").replace('"', '').strip()
                        if len(_) > 0:
                            if _ == ":" and _content_info:
                                # A bare colon belongs to the previous line:
                                # re-attach it and start a new paragraph.
                                _content_info[-1] = "\n" + _content_info[-1] + ":"
                            else:
                                _ = '        ' + _
                                _content_info.append(_)
                    content = '\n'.join(_content_info).strip()
                    tab_dict[subtitle] = content
        website_info[tab_name] = tab_dict
    # if current_title != '' and len(website_info) > 0:
    #     print("[No. {}] {}: {}".format(count, current_title, json.dumps(website_info, ensure_ascii=False)))
    return website_info
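
# A minimal, self-contained sanity check for parse_base_info. The HTML below is
# a hypothetical fragment that merely mimics the markup the 'base_info' XPath
# expects (element classes taken from xpath_parse_dict; the label/value strings
# are invented). Call it by hand to confirm the fullwidth-colon splitting.
def _demo_parse_base_info():
    sample = (
        "<div class='good-info-box'>"
        "<div class='tab '><div id='tab1_info'><div class='cItem active'>"
        "<div class='layui-row txt'><div>"
        "<p>名称:示例基地</p><p>位置:示例位置</p>"
        "</div></div></div></div></div></div>"
    )
    demo_selector = etree.HTML(sample)
    # Expected output: {'名称': '示例基地', '位置': '示例位置'}
    print(parse_base_info(demo_selector, '示例基地'))
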
{}] {}: {}".format(count, current_title, json.dumps(website_info, ensure_ascii=False))) # pass return website_info def parse_faa_info(current_selector, current_title): tab_xpath = "//div[@class='good-info-box']/div[@class='tab ']" list_tabs = current_selector.xpath(tab_xpath) faa_info = {} for current_tab in list_tabs: tab_name = ''.join(current_tab.xpath(".//p[@class='item active']/text()")).strip() if 'FAA info' in tab_name: faa_data_content = current_tab.xpath( "./div[@class='tabInfo sourcetabInfo']/div[@class='cItem active']/div[@id='data_info_list1']") data_titles = faa_data_content[0].xpath("./div[@class='bd']/span[@class='tabletitle']") data_tables = faa_data_content[0].xpath("./div[@class='bd']/table[@class='good-table table1']") for i in range(len(data_tables)): data_title = data_titles[i] data_table = data_tables[i] trs = data_table.xpath("./tbody/tr") item = {} current_item_title = '' current_item_content = {} for tr in trs: th_info = tr.xpath("./th/text()") td1_info = tr.xpath("./td[1]//text()") td2_info = tr.xpath("./td[2]//text()") if th_info is not None and len(th_info) > 0: if len(current_item_title) > 0: item[current_item_title] = current_item_content current_item_title = ''.join(th_info).strip() current_item_content = {} else: td1 = ''.join(td1_info).strip() td2 = ''.join(td2_info).strip() current_item_content[td1] = trim_space(td2) item[current_item_title] = current_item_content data_title_name = ''.join(data_title.xpath(".//text()")).strip() faa_info[data_title_name] = item # if current_title != '' and len(faa_info) > 0: # print("[No. {}] {}: {}".format(count, current_title, json.dumps(faa_info, ensure_ascii=False))) # pass return faa_info def parse_facility_info(current_selector, current_title): facility_area_xpath = "//div[@class='good-info-box']//div[@id='militarybase_main_facilities1']" facility_areas = current_selector.xpath(facility_area_xpath) facility_info_list = [] for facility_area in facility_areas: facility_table = facility_area.xpath("./table") if len(facility_table) > 0: trs = facility_table[0].xpath("./tbody/tr") ths = [] for i in range(len(trs)): tr = trs[i] if i == 0: ths = tr.xpath("./th/text()") else: td1 = ''.join([_.strip() for _ in tr.xpath("./td[1]//text()")]).strip() td2 = ''.join([_.strip() for _ in tr.xpath("./td[2]//text()")]).strip() facility_info_list.append({ ths[0]: trim_space(td1), ths[1]: trim_space(td2), }) # if current_title != '' and len(facility_info_list) > 0: # print("[No. 
{}] {}: {}".format(count, current_title, json.dumps(facility_info_list, ensure_ascii=False))) # pass return facility_info_list def parse_unit_info(current_selector, current_title): unit_info_xpath = "//div[@id='militarybase_department_list1']" unit_info_areas = current_selector.xpath(unit_info_xpath) unit_info_list = [] for unit_info_area in unit_info_areas: unit_info_table = unit_info_area.xpath("./table") if len(unit_info_table) > 0: trs = unit_info_table[0].xpath("./tbody/tr") ths = [] for i in range(len(trs)): tr = trs[i] if i == 0: ths = tr.xpath("./th/text()") else: try: td1 = tr.xpath("./td[1]//text()")[0] td2 = ''.join([_.strip() for _ in tr.xpath("./td[2]//text()")]).strip() style_info = tr.xpath("./td[2]/span/@style") style = ''.join(style_info).strip()[12:-3] tab_number = int(style) / 24 except (IndexError, ValueError): td1 = "单位" td2 = ''.join([_.strip() for _ in tr.xpath("./td[2]//text()")]).strip() if len(td2) == 0: td2 = ''.join([_.strip() for _ in tr.xpath("./td[1]//text()")]).strip() tab_number = 1 if tab_number == 1: unit_info_list.append({ ths[0]: trim_space(td1), ths[1]: trim_space(td2), "child": [] }) elif tab_number == 2: _child = unit_info_list[-1]["child"] _child.append({ ths[0]: trim_space(td1), ths[1]: trim_space(td2), "child": [] }) unit_info_list[-1]["child"] = _child elif tab_number == 3: _child = unit_info_list[-1]["child"][-1]["child"] _child.append({ ths[0]: trim_space(td1), ths[1]: trim_space(td2), "child": [] }) unit_info_list[-1]["child"][-1]["child"] = _child # if current_title != '' and len(unit_info_list) > 0: # print("[No. {}] {}: {}".format(count, current_title, json.dumps(unit_info_list, ensure_ascii=False))) # pass return unit_info_list if __name__ == '__main__': count = 0 for _path in os.listdir(target_file_path): if _path.endswith('.html'): # if _path.endswith('横须贺舰队设施机构_综合类设施_全球军事态势情报数据库.html'): count += 1 webpage_info = {} target_file = open(target_file_path + '\\' + _path, mode='r', encoding='utf-8') html_content = target_file.read().replace('\n', '') target_file.close() selector = etree.HTML(html_content) title_info = selector.xpath(xpath_parse_dict['title']) title = ''.join(title_info).strip() url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0] webpage_info["url_info"] = url_info.replace("#", "") base_info = parse_base_info(selector, title) leader_info = parse_leader_info(selector, title) website_info = parse_website_info(selector, title) faa_info = parse_faa_info(selector, title) facility_info = parse_facility_info(selector, title) unit_info = parse_unit_info(selector, title) if len(base_info) > 0: webpage_info["base_info"] = base_info if len(leader_info) > 0: webpage_info["leader_info"] = leader_info if len(website_info) > 0: webpage_info["website_info"] = website_info if len(faa_info) > 0: webpage_info["faa_info"] = faa_info if len(facility_info) > 0: webpage_info["facility_info"] = facility_info if len(unit_info) > 0: webpage_info["unit_info"] = unit_info result_file_name = title + '.json' result_file_path = 'E:/yuxin/nuofang-data/base/result0513/' + result_file_name result_file = open(result_file_path, 'w', encoding='utf-8') result_file.write(json.dumps(webpage_info, ensure_ascii=False)) print("[No. {}] {} 写入完成,原始url是{}".format(count, title, webpage_info["url_info"])) result_file.close()