"""Parse saved HTML pages and load them into the ``nfm.surfpac_person`` table.

Every ``*.html`` file directly under ``target_file_path`` is parsed with lxml
and inserted as one row.  The row ``id`` is the 1-based position of the file
in the ``os.listdir`` iteration.
"""
import datetime
import os

import pymysql
from lxml import etree

target_file_path = 'E:/yuxin/surfpac/pages'

# Values are bound by the driver (``%s`` placeholders), never interpolated
# into the statement text, so quotes/backslashes in page text are safe.
SQL_INSERT = (
    "INSERT INTO nfm.surfpac_person (id, main_title, sub_title, image, related_target, page_date, "
    "page_date_str, page_content, image_url, page_url) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
)

# Order of the non-id columns in SQL_INSERT; used to build the parameter tuple.
_PAGE_COLUMNS = (
    'main_title',
    'sub_title',
    'image',
    'related_target',
    'page_date',
    'page_date_str',
    'page_content',
    'image_url',
    'page_url',
)


def _first_or_empty(values):
    """Return the first item of *values* as a plain ``str``, or ``''`` if empty."""
    # str() turns lxml's "smart string" result into an ordinary str before it
    # is handed to the database driver.
    return str(values[0]) if values else ''


def _joined_text(selector, xpath):
    """Concatenate every text node matched by *xpath* and strip the result."""
    return ''.join(selector.xpath(xpath)).strip()


def _extract_related_target(sub_title):
    """Return the text inside the first ``(...)`` pair of *sub_title*.

    Returns ``''`` when there is no opening parenthesis or no closing
    parenthesis after it.
    """
    left = sub_title.find('(')
    if left < 0:
        return ''
    right = sub_title.find(')', left + 1)
    if right < 0:
        return ''
    return sub_title[left + 1:right]


def _parse_page(file_path):
    """Parse one HTML file and return a dict keyed by ``_PAGE_COLUMNS``.

    Raises:
        ValueError: if the page's date text does not match ``'%d %B %Y'``
            (this includes a page with no date element at all).
    """
    with open(file_path, mode='r', encoding='utf-8') as page_file:
        html_content = page_file.read().replace('\n', '')
    selector = etree.HTML(html_content)

    main_title = _joined_text(selector, "//h1[@class='maintitle']/text()")
    sub_title = _joined_text(selector, "//h4[@class='subtitle']/text()")

    page_date_str = _joined_text(
        selector, "//div[@class='content-wrap']/span[@class='date']/text()")
    page_date = datetime.datetime.strptime(
        page_date_str, '%d %B %Y').strftime('%Y-%m-%d')

    # Keep only non-blank text nodes, one stripped paragraph per line.
    raw_paragraphs = selector.xpath(
        "//div[@class='body']/div[@class='content-wrap']/text()")
    page_content = '\n'.join(
        para.strip() for para in raw_paragraphs if para.strip())

    image_url = _first_or_empty(selector.xpath(
        "//div[@class='image-wrapper']/img[@class='img-responsive']/@src"))
    # The stored image name is the last path segment of the URL ('' if none).
    image = image_url.split("/")[-1]
    page_url = _first_or_empty(
        selector.xpath("//div[@class='social']/div/@data-page-url"))

    return {
        'main_title': main_title,
        'sub_title': sub_title,
        'image': image,
        'related_target': _extract_related_target(sub_title),
        'page_date': page_date,
        'page_date_str': page_date_str,
        'page_content': page_content,
        'image_url': image_url,
        'page_url': page_url,
    }


def main():
    """Insert one ``surfpac_person`` row per ``.html`` file in ``target_file_path``."""
    # NOTE(security): host and credentials are hard-coded in source; consider
    # moving them to environment variables or a config file kept out of VCS.
    db = pymysql.connect(host='39.101.194.63', port=23306, user='root',
                         passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        with db.cursor() as cursor:
            count = 0
            for _path in os.listdir(target_file_path):
                if not _path.endswith('.html'):
                    continue
                count += 1
                page = _parse_page(os.path.join(target_file_path, _path))
                print(f"[No. {count}] {page['main_title']} - {page['related_target']}")
                params = (count,) + tuple(page[col] for col in _PAGE_COLUMNS)
                cursor.execute(SQL_INSERT, params)
                # Commit per row so already-loaded pages survive a later failure.
                db.commit()
    finally:
        # Always release the connection, even if parsing or an insert fails.
        db.close()


if __name__ == "__main__":
    main()