"""Harvest person links from the leader tables of saved HTML pages.

Every ``.html`` file found directly inside ``target_file_path`` is parsed
with lxml, the rows matched by the ``leader_info`` XPath are scanned for
anchors, and the de-duplicated ``href`` values are printed one per line.
"""
import json
import os
import re

from lxml import etree

# Directory holding the saved web pages to scan.
target_file_path = r"E:\yuxin\nuofang-data\structure\0420\webpage"

# XPath expressions keyed by the piece of the page they select.
# NOTE: the trailing space in class='tab ' is part of the attribute value
# being matched and must be kept.
xpath_parse_dict = {
    'title': "//div[@class='good-info-box']/div[@class='top cls']/span/text()",
    'base_info': "//div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']/div[@class='layui-row txt']/div[1]/p",
    'leader_info': "//div[@class='good-info-box']/div[@class='tab '][1]/div[@id='tab1_info']/div[@class='cItem active']/div[@id='leader_list']//tbody/tr",
    'website_info': "//div[@class='good-info-box']/div[@class='tab '][2]/div[@class='tabInfo sourcetabInfo']/div",
}


def trim_space(input_str: str) -> str:
    """Collapse every run of two or more spaces in *input_str* to one space.

    Only the space character is affected; tabs and newlines are left as-is.
    """
    return re.sub(' {2,}', ' ', input_str)


def parse_person_links(current_selector) -> list:
    """Return the link found in each leader-table row of a parsed page.

    Args:
        current_selector: Object exposing ``xpath()`` (here, the element
            returned by ``etree.HTML``) for the page to inspect.

    Returns:
        One string per table row that contains at least one ``<a href>``.
        Rows without anchors are skipped. If a row holds several anchors
        their hrefs are concatenated into a single string.
    """
    m_person_links = []
    for row in current_selector.xpath(xpath_parse_dict['leader_info']):
        hrefs = row.xpath(".//a/@href")
        if hrefs:
            m_person_links.append("".join(hrefs))
    return m_person_links


def _read_html(path):
    """Return the UTF-8 text of *path* with all newline characters removed."""
    # The context manager closes the handle even if read() raises.
    with open(path, mode='r', encoding='utf-8') as html_file:
        return html_file.read().replace('\n', '')


def _collect_person_links(directory):
    """Return the unique person links from every ``.html`` file in *directory*.

    Files are visited in sorted name order and duplicates are dropped while
    keeping the first occurrence, so the result is reproducible between runs.
    """
    links = []
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.html'):
            continue
        selector = etree.HTML(_read_html(os.path.join(directory, file_name)))
        if selector is None:
            # lxml yields no root element for an empty document; there is
            # nothing to query, so skip the page instead of crashing.
            continue
        links.extend(parse_person_links(selector))
    # dict keys keep insertion order, giving an order-preserving de-dup.
    return list(dict.fromkeys(links))


if __name__ == '__main__':
    for link in _collect_person_links(target_file_path):
        print(link)