import os
import re
import shutil

import pymysql
from lxml import etree

# Directory holding the saved person web pages; avatar paths found in a page
# are resolved relative to this directory.
target_file_path = r"E:\yuxin\nuofang-data\person\webpage"
# Directory that receives a copy of every avatar image that gets recorded.
avatar_output_dir = r"E:\yuxin\nuofang-data\person\person-avatar-2"


def trim_space(input_str):
    """Normalise spacing in *input_str*.

    Ideographic spaces (U+3000) are replaced by ordinary spaces, then every
    run of two or more spaces is collapsed into a single space.
    """
    return re.sub(' {2,}', ' ', input_str.replace("\u3000", " "))


def parse_avatar_info(current_selector):
    """Return the avatar image ``src`` found in the page, or ``""`` if none.

    All ``src`` attributes of ``<img>`` elements inside the
    ``good-table table1`` table are concatenated and stripped of surrounding
    whitespace.
    """
    img_url_info = current_selector.xpath(
        "//table[@class='good-table table1']//img/@src")
    return "".join(img_url_info).strip()


def _extract_page_url(selector):
    """Return the href of the last ``osc-dropMenu`` entry without ``#``, or None."""
    hrefs = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')
    if not hrefs:
        return None
    return hrefs[0].replace("#", "")


def _record_avatar(cursor, file_name):
    """Copy the avatar of one saved page and record its file name in the DB.

    Args:
        cursor: open database cursor used to run the UPDATE.
        file_name: name of an ``.html`` file inside ``target_file_path``.

    Returns:
        True when an avatar was copied and the UPDATE was executed, False
        when the page was skipped (no avatar, or no usable person id).

    Raises:
        OSError: if the page cannot be read or the avatar cannot be copied.
    """
    page_path = os.path.join(target_file_path, file_name)
    # The context manager closes the handle even if reading/decoding fails.
    with open(page_path, mode='r', encoding='utf-8') as page_file:
        html_content = page_file.read().replace('\n', '')

    selector = etree.HTML(html_content)
    if selector is None:
        # Guard against a parse that produced no tree at all.
        print("skipped, page could not be parsed: {}".format(file_name))
        return False

    page_url = _extract_page_url(selector)
    if page_url is None:
        print("skipped, no page url found: {}".format(file_name))
        return False

    avatar_url = parse_avatar_info(selector)
    if not avatar_url:
        return False

    # NOTE(review): characters 37..44 of the page url are taken as the person
    # id; presumably the urls share a fixed-length prefix - confirm.
    person_id = page_url[37:45]
    if not re.fullmatch(r"[0-9]+", person_id):
        # The id ends up in a WHERE clause; never send a non-numeric value.
        print("skipped, no numeric person id in url: {}".format(file_name))
        return False

    avatar_file_name = avatar_url.split("/")[-1]
    # The first character of the src is dropped before it is appended to the
    # webpage directory (presumably a leading "." - confirm).
    shutil.copy(target_file_path + avatar_url[1:], avatar_output_dir)

    # Parameterized query: the values come from scraped HTML, so the driver
    # must do the quoting instead of string formatting.
    cursor.execute(
        "UPDATE nfm.m_person_page SET images = %s "
        "WHERE id = %s AND images IS NULL",
        (avatar_file_name, int(person_id)))
    print([person_id, avatar_file_name])
    return True


def main():
    """Process every ``.html`` page in ``target_file_path`` and commit once."""
    # NOTE(review): the database credentials are hard-coded in source; they
    # should be moved to configuration or environment variables.
    db = pymysql.connect(host='39.101.194.63', port=23306, user='root',
                         passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        cursor = db.cursor()
        for _path in os.listdir(target_file_path):
            if _path.endswith('.html'):
                _record_avatar(cursor, _path)
        db.commit()
    finally:
        # Always release the connection, even when a page fails midway.
        db.close()


if __name__ == '__main__':
    main()