# osc/research/nuofang-db/nfm/mperson/webpage_parser.py
# Last modified: 2025-05-28 19:16:17 +08:00
import os
from lxml import etree
import json
import re
# Directory holding the saved person pages (*.html) that this script parses.
target_file_path = r"E:\yuxin\nuofang-data\person\webpage"
def trim_space(input_str):
    """Normalize whitespace in *input_str*.

    Converts ideographic spaces (U+3000) to ASCII spaces, then collapses
    every run of two or more spaces into a single space.
    """
    normalized = input_str.replace("\u3000", " ")
    return re.sub(" {2,}", " ", normalized)
def parse_base_info(current_selector):
    """Extract label/value pairs from the page's 'good-table table1' table.

    Each row contributes label cells (``td.b``) paired positionally with
    value cells (``td`` without class ``b``); the value cell's text nodes
    are concatenated and whitespace-normalized.  Returns a dict mapping
    label text to the cleaned value text.
    """
    base_info = {}
    rows = current_selector.xpath("//table[@class='good-table table1']/tbody/tr")
    for row in rows:
        labels = row.xpath("./td[@class='b']/text()")
        value_cells = row.xpath("./td[not(@class='b')]")
        for idx, label in enumerate(labels):
            # Positional pairing: the i-th label belongs to the i-th value cell.
            raw_text = "".join(value_cells[idx].xpath(".//text()"))
            base_info[label] = trim_space(raw_text).strip()
    return base_info
def parse_website_info(current_selector):
    """Parse the active tab (``div.tabInfo > div.cItem.active``) into a dict.

    The result contains:
      * one label/value entry taken from the tab's ``<dt>`` line (typically
        the update-time row), and
      * one entry per sub-section, keyed by its ``<h2>`` title, whose value
        is either the section's joined paragraph text or its ``<dd>``
        source list joined with ';'.

    Returns an empty dict when the tab area is absent.
    """
    info_area = current_selector.xpath(
        "//div[@class='tabInfo']/div[@class='cItem active']")
    if not info_area:
        # BUG FIX: the original returned [] here but a dict on the success
        # path; return {} so callers always get one type (the caller's
        # len(...) > 0 check works unchanged).
        return {}
    area = info_area[0]

    website_info = {}
    update_time_info = area.xpath(".//dt//text()")
    if update_time_info:
        update_time = trim_space("".join(update_time_info))
        # BUG FIX: the original called update_time.split("") — an empty
        # separator always raises ValueError.  The separator character was
        # evidently lost in an encoding round-trip; a full-width colon
        # (U+FF1A) is assumed here.  TODO(review): confirm against the
        # source pages.
        label, sep, value = update_time.partition("\uff1a")
        if sep:
            website_info[label.strip()] = value.strip()

    subtitles = area.xpath("./div/h2/text()")
    subcontents = []
    for section in area.xpath("./div/dl"):
        paragraph_texts = section.xpath("./div//text()")
        if paragraph_texts:
            # Join paragraphs with a literal two-character " \n" marker
            # (backslash + n), reproducing the original output format.
            paragraphs = [trim_space(t).strip() for t in paragraph_texts]
            subcontents.append(" " + " \\n".join(paragraphs))
        else:
            source_texts = section.xpath("./dd//text()")
            if source_texts:
                subcontents.append(";".join(source_texts))

    # zip() tolerates a title/content count mismatch; the original indexed
    # subcontents[i] and raised IndexError when contents were missing.
    # NOTE(review): the original also applied subtitle.replace("", "") — a
    # no-op whose target character was lost in the same bad encoding.
    for subtitle, subcontent in zip(subtitles, subcontents):
        website_info[subtitle] = subcontent
    return website_info
if __name__ == '__main__':
    # Walk the saved .html pages, parse each one, and write the extracted
    # fields to a per-page JSON file named after the page title.
    count = 0
    for _path in os.listdir(target_file_path):
        if not _path.endswith('.html'):
            continue
        count += 1
        webpage_info = {}

        # BUG FIX: use context managers so file handles are closed even if
        # parsing raises (the original left the result file open on error).
        # Flatten the page to one line before feeding it to lxml.
        with open(os.path.join(target_file_path, _path),
                  mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)

        title_info = selector.xpath(
            "//div[@class='good-info-box']/div[@class='top cls']/span/text()")
        title = ''.join(title_info).strip()
        # The last dropdown entry links to the original source URL; [0]
        # intentionally raises IndexError on a malformed page so bad input
        # is noticed rather than silently skipped.
        url_info = selector.xpath(
            '//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
        webpage_info["url_info"] = url_info.replace("#", "")

        base_info = parse_base_info(selector)
        if base_info:
            webpage_info["base_info"] = base_info
        website_info = parse_website_info(selector)
        if len(website_info) > 0:
            webpage_info["website_info"] = website_info

        # Strip characters that are illegal or awkward in file names.
        result_file_name = (title.replace("/", "-")
                            .replace('"', '').replace("'", '') + '.json')
        result_file_path = ('E:/yuxin/nuofang-data/person/result0515/'
                            + result_file_name)
        with open(result_file_path, 'w', encoding='utf-8') as result_file:
            result_file.write(json.dumps(webpage_info, ensure_ascii=False))
        print("[No. {}] {} 写入完成原始url是 {}".format(
            count, title, webpage_info["url_info"]))