import os
import shutil
import json
import re

from lxml import etree

target_file_path = r"E:/yuxin/nuofang-data/activity/webpage"
result_folder_path = r"E:/yuxin/nuofang-data/activity/result"
image_result_path = r"E:/yuxin/nuofang-data/activity/images"


def trim_space(input_str):
    """Collapse runs of two or more spaces into a single space."""
    return re.sub(' {2,}', ' ', input_str)


def trim_n_space(input_str):
    """Collapse repeated literal "\\n " escape sequences and drop the space
    before a colon (currently unused)."""
    for _ in range(5):
        input_str = input_str.replace("\\n \\n ", "\\n ")
    input_str = input_str.replace("\\n :", ":")
    return input_str


def get_table_lines(table_areas):
    """Flatten each table row into a list of its <th> or <td> cell texts
    (currently unused)."""
    _contents = []
    for table_area in table_areas:
        for line in table_area.xpath(".//tr"):
            line_th = [trim_space("".join(th.xpath(".//text()"))) for th in line.xpath(".//th")]
            line_td = [trim_space("".join(td.xpath(".//text()"))) for td in line.xpath(".//td")]
            if line_th:
                _contents.append(line_th)
            if line_td:
                _contents.append(line_td)
    return _contents


def parse_base_info(current_selector):
    """Read the "label:value" pairs from the first info tab."""
    base_info_line = current_selector.xpath(
        "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']"
        "/div[@class='cItem active']/div[@class='layui-row txt']/div/p")
    base_info = {}
    for line in base_info_line:
        line_text = ''.join(line.xpath(".//text()"))
        label = line_text.split(':')[0].replace("\xa0", "").replace(" ", "").strip()
        # Rejoin with ':' so colons inside the value (e.g. in URLs) survive.
        value = ':'.join(line_text.split(':')[1:]).strip()
        base_info[label] = trim_space(value)
    return base_info


if __name__ == '__main__':
    # Make sure the output folders exist before writing into them.
    os.makedirs(result_folder_path, exist_ok=True)
    os.makedirs(image_result_path, exist_ok=True)

    count = 0
    for _path in os.listdir(target_file_path):
        if not _path.endswith('.html'):
            continue
        # Debug filters for single-page runs:
        # if _path == '15th SPSS Activation Ceremony_军情动态_全球军事态势情报数据库.html':
        # if _path == 'AEDC stands up 804th Test Group with ceremony_军情动态_全球军事态势情报数据库.html':
        count += 1
        webpage_info = {}

        with open(target_file_path + '/' + _path, mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)

        title_info = selector.xpath("//div[@class='good-info-box']/div[@class='top cls']/span/text()")
        title = ''.join(title_info).strip()

        url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
        webpage_info["url_info"] = url_info.replace("#", "")

        base_info = parse_base_info(selector)
        if len(base_info) > 0:
            webpage_info["base_info"] = base_info

        # Body text of the second tab; drop the "(查看原文)" / "(主要内容)"
        # navigation labels that sit inside the same <p> elements.
        main_content_info = selector.xpath(
            "//div[@class='good-info-box']/div[@class='tab '][2]//div[@class='cItem active']//p//text()")
        main_content_list = []
        for mc in main_content_info:
            if '(查看原文)' in mc or '(主要内容)' in mc:
                continue
            main_content_list.append(" " + mc)
        main_content = "\n".join(main_content_list)

        # Copy every referenced image into the image result folder; src values
        # are relative ("./..."), so strip the leading dot before joining.
        image_info = selector.xpath(
            "//div[@class='good-info-box']/div[@class='tab '][2]/div[@class='tabInfo sourcetabInfo']"
            "/div[@class='cItem active']/div[@id='media_list']//img/@src")
        for image_uri in image_info:
            image_path = target_file_path + image_uri[1:]
            shutil.copy(image_path, image_result_path)
        images = [img.split('/')[-1] for img in image_info]

        video_info = selector.xpath("//div[@class='cItem active']/div[@id='media_list']/div[2]/div/video/@src")
        videos = [v.split('/')[-1] for v in video_info]

        source_info = selector.xpath("/html/body/div[2]/div[1]/div[2]/div[3]/div/div/div[5]/a/text()")
        source = ''.join(source_info)

        webpage_info['main_content'] = main_content.replace("'", "’")
        webpage_info['images'] = images
        webpage_info['videos'] = videos
        webpage_info['info_source_url'] = source

        # Replace characters that are illegal in Windows file names.
        result_file_name = title + '.json'
        for ch in ['"', '/', '\\', '|', ':', '?', '<', '>', '*']:
            result_file_name = result_file_name.replace(ch, "_")
        result_file_path = "{}/{}".format(result_folder_path, result_file_name)

        with open(result_file_path, 'w', encoding='utf-8') as result_file:
            result_file.write(json.dumps(webpage_info, ensure_ascii=False))
        print("[No. {}] {} written".format(count, title))
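
# Illustrative only: each output JSON file has roughly this shape. The key
# names come from the assignments above; every value below is a hypothetical
# placeholder, not real data. Note that despite its name, 'info_source_url'
# holds the anchor *text* of the source link, not its href.
#
# {
#     "url_info": "https://example.com/article",
#     "base_info": {"<label>": "<value>"},
#     "main_content": " First paragraph\n Second paragraph",
#     "images": ["photo1.jpg"],
#     "videos": ["clip1.mp4"],
#     "info_source_url": "Example Source"
# }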