# 2025-05-28 19:16:17 +08:00
#
# 74 lines
# 3.3 KiB
# Python
import os
import pymysql
from lxml import etree
import json
import re
# Directory holding the pre-downloaded congress.gov member pages (*.html).
target_file_path = r"E:\yuxin\us-congress-html\pages"
def trim_space(input_str):
    """Collapse every run of two or more consecutive spaces into one space."""
    return re.sub(' {2,}', ' ', input_str)
# Scratch dict for profiling max column widths; only referenced by the
# commented-out measurement code below.
col_length = {}

if __name__ == '__main__':
    # NOTE(review): DB credentials are hard-coded — move to env vars / config
    # before sharing or deploying this script.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm',
                         charset='utf8mb4')
    cursor = db.cursor()
    count = 0
    for _path in os.listdir(target_file_path):
        if not _path.endswith('.html'):
            continue
        # if _path.endswith('alma-adams-A000370.html'):
        count += 1
        page_info = {}
        # `with` guarantees the handle is closed even if parsing raises;
        # newlines are stripped so joined xpath text stays on one line.
        with open(os.path.join(target_file_path, _path),
                  mode='r', encoding='utf-8') as target_file:
            html_content = target_file.read().replace('\n', '')
        selector = etree.HTML(html_content)
        # Filename pattern: <name-slug>-<bioguide_id>.html
        person_id = _path[:-5].split("-")[-1]
        page_url = ('https://www.congress.gov/member/'
                    + _path[:-5].replace("-" + person_id, "") + '/' + person_id)
        page_info['id'] = person_id
        page_info['source'] = page_url
        title_info = selector.xpath("//meta[@property='og:title']/@content")
        birth_info = selector.xpath("//h1[@class='legDetail']/span[@class='birthdate']/text()")
        in_congress_info = selector.xpath("//h1[@class='legDetail']/span[2]/text()")
        photo_info = selector.xpath("//div[@class='overview-member-column-picture']/img/@src")
        page_info['title'] = ''.join(title_info).strip()
        page_info['birth'] = ''.join(birth_info).strip()[1:-1]  # strip surrounding parentheses
        page_info['in_congress'] = ''.join(in_congress_info).replace("|", "").strip()
        photo_uri = ''.join(photo_info).strip()
        if len(photo_uri) > 0:
            page_info['photo_url'] = 'https://www.congress.gov' + photo_uri
            page_info['photo'] = photo_uri.split('/')[-1]
        # Key/value rows of the member-profile overview table; <th> text becomes
        # the column name, <td> text fragments are joined line-by-line.
        table_lines = selector.xpath(
            "//div[@class='overview-member-column-profile member_profile']/table[@class='standard01 nomargin']/tbody/tr")
        for table_line in table_lines:
            label = trim_space("".join(table_line.xpath("./th//text()"))).strip()
            if len(label) <= 0:
                continue
            value = ""
            value_info = table_line.xpath("./td//text()")
            for _ in value_info:
                value += trim_space(_).strip()
                value += '\n'
            label = label.lower().replace(" ", "_")
            page_info[label] = value.strip()
        # BUG FIX: this cleanup used to run *before* the table loop above, at a
        # point where 'contact' could never be present, so it never fired.
        if 'contact' in page_info:
            page_info['contact'] = page_info['contact'].replace("\nContact", "")
        # for key in page_info:
        #     if key in col_length and col_length[key] >= len(page_info[key]):
        #         continue
        #     col_length[key] = len(page_info[key])
        # Parameterized VALUES instead of hand-escaped string concatenation
        # (avoids SQL injection / broken quoting on scraped text). Column
        # names cannot be bound as parameters, so they are backtick-quoted.
        cols = ', '.join('`{}`'.format(_) for _ in page_info)
        placeholders = ', '.join(['%s'] * len(page_info))
        SQL_INSERT = "INSERT INTO nfm.congress_person ({}) VALUES ({})".format(cols, placeholders)
        print("[No. {}] {}".format(count, SQL_INSERT))
        cursor.execute(SQL_INSERT, list(page_info.values()))
        db.commit()
    db.close()