import os import pymysql from lxml import etree import json import re target_file_path = r"E:\yuxin\us-congress-html\pages" def trim_space(input_str): result = re.sub(' {2,}', ' ', input_str) return result col_length = {} if __name__ == '__main__': db = pymysql.connect(host='39.101.194.63', port=23306, user='root', passwd='passok123A', db='nfm', charset='utf8mb4') cursor = db.cursor() count = 0 for _path in os.listdir(target_file_path): if _path.endswith('.html'): # if _path.endswith('alma-adams-A000370.html'): count += 1 page_info = {} target_file = open(target_file_path + '\\' + _path, mode='r', encoding='utf-8') html_content = target_file.read().replace('\n', '') target_file.close() selector = etree.HTML(html_content) person_id = _path[:-5].split("-")[-1] page_url = 'https://www.congress.gov/member/' + _path[:-5].replace("-" + person_id, "") + '/' + person_id page_info['id'] = person_id page_info['source'] = page_url title_info = selector.xpath("//meta[@property='og:title']/@content") birth_info = selector.xpath("//h1[@class='legDetail']/span[@class='birthdate']/text()") in_congress_info = selector.xpath("//h1[@class='legDetail']/span[2]/text()") photo_info = selector.xpath("//div[@class='overview-member-column-picture']/img/@src") page_info['title'] = ''.join(title_info).strip() page_info['birth'] = ''.join(birth_info).strip()[1:-1] # 去括号 page_info['in_congress'] = ''.join(in_congress_info).replace("|", "").strip() photo_uri = ''.join(photo_info).strip() if len(photo_uri) > 0: page_info['photo_url'] = 'https://www.congress.gov' + photo_uri page_info['photo'] = photo_uri.split('/')[-1] if 'contact' in page_info: page_info['contact'] = page_info['contact'].replace("\nContact", "") table_lines = selector.xpath( "//div[@class='overview-member-column-profile member_profile']/table[@class='standard01 nomargin']/tbody/tr") for table_line in table_lines: label = trim_space("".join(table_line.xpath("./th//text()"))).strip() if len(label) <= 0: continue value = "" value_info = table_line.xpath("./td//text()") for _ in value_info: value += trim_space(_).strip() value += '\n' label = label.lower().replace(" ", "_") page_info[label] = value.strip() # for key in page_info: # if key in col_length and col_length[key] >= len(page_info[key]): # continue # col_length[key] = len(page_info[key]) cols = [_ for _ in page_info] vals = ["'" + page_info[_].replace("'", "\\'") + "'" for _ in page_info] SQL_INSERT = "INSERT INTO nfm.congress_person ({}) VALUES ({})".format(', '.join(cols), ', '.join(vals)) print("[No. {}] {}".format(count, SQL_INSERT)) cursor.execute(SQL_INSERT) db.commit() db.close()