# osc/research/nuofang-db/nfm/mperson/parse_person_link.py

import os
from lxml import etree
import json
import re

# Directory containing the saved HTML pages that the script walks through.
target_file_path = r"E:\yuxin\nuofang-data\structure\0420\webpage"

# XPath expressions keyed by the page section they select. The whitespace
# inside the class values ('tab ', 'cItem  active') is part of the match and
# must be kept exactly as written.
xpath_parse_dict = dict(
    title=(
        "//div[@class='good-info-box']"
        "/div[@class='top cls']/span/text()"
    ),
    base_info=(
        "//div[@class='tab '][1]"
        "/div[@id='tab1_info']"
        "/div[@class='cItem  active']"
        "/div[@class='layui-row txt']"
        "/div[1]/p"
    ),
    leader_info=(
        "//div[@class='good-info-box']"
        "/div[@class='tab '][1]"
        "/div[@id='tab1_info']"
        "/div[@class='cItem  active']"
        "/div[@id='leader_list']//tbody/tr"
    ),
    website_info=(
        "//div[@class='good-info-box']"
        "/div[@class='tab '][2]"
        "/div[@class='tabInfo sourcetabInfo']"
        "/div"
    ),
)


def trim_space(input_str):
    """Collapse every run of two or more spaces in *input_str* into one space.

    Only the space character is affected: single spaces, tabs and newlines
    are returned unchanged, and leading/trailing spaces are not stripped.
    """
    return re.sub(' {2,}', ' ', input_str)


def parse_person_links(current_selector):
    """Collect the link targets found in the leader table of a parsed page.

    Args:
        current_selector: Parsed document exposing ``.xpath()``; the script
            passes the result of ``etree.HTML``.

    Returns:
        list: One string per ``href`` attribute of the ``<a>`` elements found
        inside the rows selected by ``xpath_parse_dict['leader_info']``.
        Rows without a link contribute nothing; duplicates are kept.
    """
    m_person_links = []
    for line in current_selector.xpath(xpath_parse_dict['leader_info']):
        # Each href becomes its own entry: joining the hrefs of a row that
        # holds several anchors would fuse them into one unusable link.
        # str() stores plain strings rather than the XPath result objects.
        m_person_links.extend(str(href) for href in line.xpath(".//a/@href"))
    return m_person_links


if __name__ == '__main__':
    # Gather the leader links from every saved HTML page in the target
    # directory, then print each distinct link once.
    person_links = []
    for _path in os.listdir(target_file_path):
        # Only saved pages are parsed; narrow this filter to a single file
        # name when debugging one page.
        if not _path.endswith('.html'):
            continue
        # os.path.join avoids a hard-coded Windows separator, and the context
        # manager closes the file even if reading it fails.
        page_path = os.path.join(target_file_path, _path)
        with open(page_path, mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)
        person_links += parse_person_links(selector)
    # Sorting the de-duplicated links makes the printed order reproducible;
    # iterating a bare set of strings gives no stable order between runs.
    person_links = sorted(set(person_links))
    for link in person_links:
        print(link)