# 2025-05-28 19:16:17 +08:00
#
# 74 lines
# 3.3 KiB
# Python
import os
import pymysql
from lxml import etree
import json
import re
# Directory holding the pre-downloaded congress.gov member pages (*.html).
target_file_path = r"E:\yuxin\us-congress-html\pages"
def trim_space(input_str):
    """Collapse every run of two or more consecutive spaces into one space."""
    return re.sub(' {2,}', ' ', input_str)
# Scratch dict for profiling max column widths; only referenced by the
# commented-out measurement code below.
col_length = {}

if __name__ == '__main__':
    # NOTE(review): DB credentials are hard-coded — move to env vars / config
    # before sharing or deploying this script.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm',
                         charset='utf8mb4')
    cursor = db.cursor()
    count = 0
    for _path in os.listdir(target_file_path):
        if not _path.endswith('.html'):
            continue
        # if _path.endswith('alma-adams-A000370.html'):
        count += 1
        page_info = {}
        # `with` guarantees the handle is closed even if parsing raises;
        # newlines are stripped so joined xpath text stays on one line.
        with open(os.path.join(target_file_path, _path),
                  mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)
        # Filename pattern: <name-slug>-<bioguide_id>.html
        person_id = _path[:-5].split("-")[-1]
        page_url = ('https://www.congress.gov/member/'
                    + _path[:-5].replace("-" + person_id, "") + '/' + person_id)
        page_info['id'] = person_id
        page_info['source'] = page_url
        title_info = selector.xpath("//meta[@property='og:title']/@content")
        birth_info = selector.xpath("//h1[@class='legDetail']/span[@class='birthdate']/text()")
        in_congress_info = selector.xpath("//h1[@class='legDetail']/span[2]/text()")
        photo_info = selector.xpath("//div[@class='overview-member-column-picture']/img/@src")
        page_info['title'] = ''.join(title_info).strip()
        page_info['birth'] = ''.join(birth_info).strip()[1:-1]  # strip surrounding parentheses
        page_info['in_congress'] = ''.join(in_congress_info).replace("|", "").strip()
        photo_uri = ''.join(photo_info).strip()
        if len(photo_uri) > 0:
            page_info['photo_url'] = 'https://www.congress.gov' + photo_uri
            page_info['photo'] = photo_uri.split('/')[-1]
        # Key/value rows of the member-profile overview table; <th> text becomes
        # the column name, <td> text fragments are joined line-by-line.
        table_lines = selector.xpath(
            "//div[@class='overview-member-column-profile member_profile']/table[@class='standard01 nomargin']/tbody/tr")
        for table_line in table_lines:
            label = trim_space("".join(table_line.xpath("./th//text()"))).strip()
            if len(label) <= 0:
                continue
            value = ""
            value_info = table_line.xpath("./td//text()")
            for _ in value_info:
                value += trim_space(_).strip()
                value += '\n'
            label = label.lower().replace(" ", "_")
            page_info[label] = value.strip()
        # BUG FIX: this cleanup used to run *before* the table loop above, at a
        # point where 'contact' could never be present, so it never fired.
        if 'contact' in page_info:
            page_info['contact'] = page_info['contact'].replace("\nContact", "")
        # for key in page_info:
        #     if key in col_length and col_length[key] >= len(page_info[key]):
        #         continue
        #     col_length[key] = len(page_info[key])
        # Parameterized VALUES instead of hand-escaped string concatenation
        # (avoids SQL injection / broken quoting on scraped text). Column
        # names cannot be bound as parameters, so they are backtick-quoted.
        cols = ', '.join('`{}`'.format(_) for _ in page_info)
        placeholders = ', '.join(['%s'] * len(page_info))
        SQL_INSERT = "INSERT INTO nfm.congress_person ({}) VALUES ({})".format(cols, placeholders)
        print("[No. {}] {}".format(count, SQL_INSERT))
        cursor.execute(SQL_INSERT, list(page_info.values()))
        db.commit()
    db.close()