74 lines
3.3 KiB
Python
74 lines
3.3 KiB
Python
import os
|
|
|
|
import pymysql
|
|
from lxml import etree
|
|
import json
|
|
import re
|
|
|
|
# Directory that the script's main section lists; every file in it whose name
# ends in ".html" is read and parsed as one member page.
target_file_path = r"E:\yuxin\us-congress-html\pages"
|
|
|
|
|
|
# Compiled once at import time: trim_space() is called for every text node
# of every parsed page, so the pattern lookup is hoisted out of the call.
_MULTI_SPACE_RE = re.compile(r' {2,}')


def trim_space(input_str: str) -> str:
    """Collapse every run of two or more spaces into a single space.

    Only the space character (U+0020) is affected; tabs, newlines and other
    whitespace are left untouched, and leading/trailing single spaces are
    kept (callers strip separately).

    Args:
        input_str: Text to normalise.

    Returns:
        A copy of ``input_str`` with each run of spaces reduced to one space.
    """
    return _MULTI_SPACE_RE.sub(' ', input_str)
|
|
|
|
|
|
# Not read or written by any live code visible in this file.
# NOTE(review): looks like a leftover for tracking the longest value seen per
# column (useful for sizing table columns) - confirm before removing.
col_length = {}
|
|
|
|
def split_member_filename(file_name):
    """Split a saved page's file name into ``(slug, person_id)``.

    File names look like ``alma-adams-A000370.html``: the part after the last
    ``-`` is the member id and the part before it is the URL slug. Only the
    trailing ``-<id>`` is removed, so a slug that happens to contain the same
    text earlier on is left intact. A name without any ``-`` yields the whole
    stem for both parts.

    Args:
        file_name: File name ending in ``.html``.

    Returns:
        Tuple ``(slug, person_id)``.
    """
    stem = file_name[:-len('.html')]
    slug, dash, person_id = stem.rpartition('-')
    if not dash:
        slug = stem
    return slug, person_id


def parse_member_page(file_name, html_content):
    """Extract one member's fields from a saved profile page.

    Args:
        file_name: Name of the ``.html`` file; supplies the id and source URL.
        html_content: Page markup (the caller strips newlines beforehand).

    Returns:
        dict mapping column name to string value. Fixed keys come first
        (``id``, ``source``, ``title``, ``birth``, ``in_congress`` and, when a
        picture is present, ``photo_url`` / ``photo``), followed by one key
        per labelled row of the profile table (label lower-cased, spaces
        replaced by underscores). A table label overrides a fixed key of the
        same name.
    """
    selector = etree.HTML(html_content)
    slug, person_id = split_member_filename(file_name)

    page_info = {
        'id': person_id,
        'source': 'https://www.congress.gov/member/' + slug + '/' + person_id,
    }

    title_info = selector.xpath("//meta[@property='og:title']/@content")
    birth_info = selector.xpath("//h1[@class='legDetail']/span[@class='birthdate']/text()")
    in_congress_info = selector.xpath("//h1[@class='legDetail']/span[2]/text()")
    photo_info = selector.xpath("//div[@class='overview-member-column-picture']/img/@src")

    page_info['title'] = ''.join(title_info).strip()
    # Drop the first and last character (the surrounding parentheses).
    page_info['birth'] = ''.join(birth_info).strip()[1:-1]
    page_info['in_congress'] = ''.join(in_congress_info).replace("|", "").strip()

    photo_uri = ''.join(photo_info).strip()
    if photo_uri:
        page_info['photo_url'] = 'https://www.congress.gov' + photo_uri
        page_info['photo'] = photo_uri.split('/')[-1]

    table_lines = selector.xpath(
        "//div[@class='overview-member-column-profile member_profile']"
        "/table[@class='standard01 nomargin']/tbody/tr")
    for table_line in table_lines:
        label = trim_space("".join(table_line.xpath("./th//text()"))).strip()
        if not label:
            continue
        # One line per text node of the cell, each normalised on its own.
        value = '\n'.join(
            trim_space(text).strip() for text in table_line.xpath("./td//text()"))
        page_info[label.lower().replace(" ", "_")] = value.strip()

    # Must run AFTER the table has been parsed: 'contact' only ever enters
    # page_info as a table row, so checking earlier never matched anything.
    if 'contact' in page_info:
        page_info['contact'] = page_info['contact'].replace("\nContact", "")

    return page_info


def build_insert_statement(page_info):
    """Build a parameterised INSERT for ``nfm.congress_person``.

    Values are passed as query parameters instead of being spliced into the
    SQL text, so quotes and backslashes in scraped text cannot break or
    alter the statement. Column names come from scraped table labels, so they
    are backtick-quoted with embedded backticks doubled; ``%`` is doubled as
    well because the driver applies ``%``-formatting to the query text.

    Args:
        page_info: dict mapping column name to value.

    Returns:
        Tuple ``(sql, params)`` suitable for ``cursor.execute(sql, params)``.
    """
    cols = [
        '`' + col.replace('`', '``').replace('%', '%%') + '`'
        for col in page_info
    ]
    placeholders = ['%s'] * len(cols)
    sql = "INSERT INTO nfm.congress_person ({}) VALUES ({})".format(
        ', '.join(cols), ', '.join(placeholders))
    return sql, list(page_info.values())


if __name__ == '__main__':
    # NOTE(review): host and credentials are hard-coded in source; they should
    # be moved to configuration/environment and the password rotated.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        cursor = db.cursor()

        count = 0
        for _path in os.listdir(target_file_path):
            if not _path.endswith('.html'):
                continue
            count += 1

            # 'with' guarantees the handle is closed even if reading fails.
            with open(os.path.join(target_file_path, _path),
                      mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')

            page_info = parse_member_page(_path, html_content)
            sql_insert, params = build_insert_statement(page_info)

            # mogrify() renders the exact statement the driver will send.
            print("[No. {}] {}".format(count, cursor.mogrify(sql_insert, params)))
            cursor.execute(sql_insert, params)
            db.commit()
    finally:
        # Release the connection even when a page fails to parse or insert.
        db.close()
|