import json
import os
import re

from lxml import etree

# Root directory containing the saved *.html pages to parse.
target_file_path = r"E:\yuxin\nuofang-data\person\webpage"


def trim_space(input_str):
    """Normalize whitespace: map full-width spaces (U+3000) to ASCII spaces
    and collapse runs of two or more spaces into one."""
    return re.sub(' {2,}', ' ', input_str.replace("\u3000", " "))


def parse_base_info(current_selector):
    """Extract label/value pairs from the 'good-table table1' table.

    Returns a dict mapping each label cell (td.b) text to the trimmed,
    concatenated text of the corresponding non-label cell in the same row.
    """
    base_info = {}
    rows = current_selector.xpath("//table[@class='good-table table1']/tbody/tr")
    for row in rows:
        labels = row.xpath("./td[@class='b']/text()")
        value_cells = row.xpath("./td[not(@class='b')]")
        # zip pairs labels with value cells positionally; unlike the previous
        # index-based loop it cannot raise IndexError when a malformed row
        # has fewer value cells than labels.
        for label, cell in zip(labels, value_cells):
            base_info[label] = trim_space("".join(cell.xpath(".//text()"))).strip()
    return base_info


def parse_website_info(current_selector):
    """Extract the active 'tabInfo' panel: the update-time line plus the
    titled sections (h2 headings paired with dl contents).

    Returns a dict; an empty dict when the panel is absent.  (Previously
    this branch returned [], which was inconsistent with the dict return
    type; the only caller tests len(), so {} is backward compatible.)
    """
    website_info = {}
    info_area = current_selector.xpath("//div[@class='tabInfo']/div[@class='cItem active']")
    if not info_area:
        return website_info
    panel = info_area[0]

    update_time_info = panel.xpath(".//dt//text()")
    if update_time_info:
        update_time = trim_space("".join(update_time_info))
        # partition splits on the FIRST ":" only, so colons inside the value
        # (e.g. a "10:30:00" timestamp) are preserved; split(":")[1] used to
        # truncate them and raised IndexError when no colon was present.
        label, _, value = update_time.partition(":")
        website_info[label.strip()] = value.strip()

    subtitles = panel.xpath("./div/h2/text()")
    subcontents = []
    for area in panel.xpath("./div/dl"):
        content_info = area.xpath("./div//text()")
        if content_info:
            # Join cleaned paragraphs with a literal backslash-n marker
            # (" \n" as two characters), matching the original output format.
            paragraphs = [trim_space(text).strip() for text in content_info]
            subcontents.append(" " + " \\n".join(paragraphs))
        else:
            source_info = area.xpath("./dd//text()")
            if source_info:
                subcontents.append(";".join(source_info))

    # zip stops at the shorter list, so a heading without a matching section
    # no longer raises IndexError.
    for subtitle, subcontent in zip(subtitles, subcontents):
        website_info[subtitle.replace(":", "")] = subcontent
    return website_info


if __name__ == '__main__':
    result_dir = 'E:/yuxin/nuofang-data/person/result0515/'
    count = 0
    for _path in os.listdir(target_file_path):
        if not _path.endswith('.html'):
            continue
        count += 1
        webpage_info = {}

        # with-statement guarantees the handle is closed even if read() raises;
        # newlines are stripped so xpath text joins are not newline-polluted.
        with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)

        title_info = selector.xpath("//div[@class='good-info-box']/div[@class='top cls']/span/text()")
        title = ''.join(title_info).strip()

        # NOTE(review): assumes every page has the dropMenu link — [0] raises
        # IndexError otherwise; confirm against the saved pages.
        url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
        webpage_info["url_info"] = url_info.replace("#", "")

        base_info = parse_base_info(selector)
        if base_info:
            webpage_info["base_info"] = base_info

        website_info = parse_website_info(selector)
        if website_info:
            webpage_info["website_info"] = website_info

        # Strip characters that are illegal or awkward in Windows filenames.
        result_file_name = title.replace("/", "-").replace('"', '').replace("'", '') + '.json'
        with open(result_dir + result_file_name, 'w', encoding='utf-8') as result_file:
            result_file.write(json.dumps(webpage_info, ensure_ascii=False))
        print("[No. {}] {} 写入完成,原始url是 {}".format(count, title, webpage_info["url_info"]))