"""Backfill ``info_source_url`` in ``nfm.m_act`` from saved activity pages.

For every ``*.html`` file directly inside ``target_file_path`` the script
parses the page, reads the source link text and the page's internal id
(``parse_base_info(selector)['内部编号']``), and issues one UPDATE per page.
All updates are committed together once every page has been processed.
"""
import os
from urllib.parse import urlparse  # no longer used below; kept so the file's import set is unchanged

import pymysql
from lxml import etree

from nfm.mact.page_parser import parse_base_info

target_file_path = r"E:/yuxin/nuofang-data/activity/webpage"

# Absolute XPath to the text of the "source" link on a saved page.
_SOURCE_XPATH = "/html/body/div[2]/div[1]/div[2]/div[3]/div/div/div[5]/a/text()"

# Parameterized statement: the driver escapes the values, so quotes or other
# special characters in a scraped URL cannot break or alter the query.
_SQL_UPDATE = "UPDATE nfm.m_act SET info_source_url = %s WHERE internal_id = %s"


def _load_selector(html_path):
    """Read the HTML file at *html_path* and return its parsed lxml tree."""
    # The context manager closes the file even if reading fails.
    with open(html_path, mode='r', encoding='utf-8') as page_file:
        html_content = page_file.read().replace('\n', '')
    return etree.HTML(html_content)


if __name__ == '__main__':
    # NOTE(review): connection credentials are hard-coded in source; consider
    # moving them to environment variables or a config file kept out of VCS.
    db = pymysql.connect(host='39.101.194.63', port=23306, user='root',
                         passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        cursor = db.cursor()
        count = 0
        for _path in os.listdir(target_file_path):
            if not _path.endswith('.html'):
                continue
            count += 1

            selector = _load_selector(os.path.join(target_file_path, _path))

            # An absent link yields an empty string, which is still written
            # to the row (same as before).
            source = ''.join(selector.xpath(_SOURCE_XPATH))

            base_info = parse_base_info(selector)
            internal_id = base_info['内部编号']

            # str() keeps the comparison string-typed, matching the quoted
            # literal the previous string-formatted query produced.
            cursor.execute(_SQL_UPDATE, (source, str(internal_id)))
            print("[No. {}]".format(count))

        db.commit()
    finally:
        # Always release the connection, including when a page fails to parse
        # or an UPDATE raises; uncommitted work is discarded in that case.
        db.close()