"""Collect the images referenced by saved structure pages and record them in MySQL.

For every ``.html`` file in ``target_file_path`` the script:

1. parses the page and reads its internal code via ``parse_base_info``;
2. extracts the image links found under ``div#det_right`` (``a.img`` anchors);
3. copies images referenced with a relative ``./`` URI into ``images_path``;
4. remembers ``http...`` URIs so they can be listed at the end of the run;
5. stores the JSON list of image file names in ``nfm.m_struct_page.images``
   for the row whose ``internal_code`` matches the page.
"""
import json
import os
import shutil

import pymysql
from lxml import etree

from nfm.mstruct.webpage_parser import parse_base_info

target_file_path = r"E:/yuxin/nuofang-data/structure/0420/webpage"
images_path = r"E:/yuxin/nuofang-data/structure/0420/images"

if __name__ == '__main__':
    count = 0
    # NOTE(review): the connection credentials are hard-coded in source;
    # consider loading them from the environment or a config file.
    db = pymysql.connect(host='39.101.194.63', port=23306, user='root', passwd='passok123A', db='nfm',
                         charset='utf8mb4')
    internet_image_urls = []
    try:
        cursor = db.cursor()
        for _path in os.listdir(target_file_path):
            if not _path.endswith('.html'):
                continue
            count += 1

            # Context manager guarantees the handle is released even if read() fails.
            with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')

            selector = etree.HTML(html_content)
            base_info = parse_base_info(selector)
            # '内部编号' is the key under which parse_base_info exposes the page's
            # internal code; it is matched against internal_code below.
            internal_id = base_info['内部编号']

            images_uris = selector.xpath("//div[@id='det_right']//a[@class='img']/@href")
            images_file_list = []
            for image_uri in images_uris:
                images_file_name = image_uri.split("/")[-1]
                images_file_list.append(images_file_name)
                if image_uri.startswith("./"):
                    # Drop the leading '.' so the remaining "/..." part is
                    # appended to the webpage directory.
                    image_source_path = target_file_path + image_uri[1:]
                    shutil.copy(image_source_path, images_path + "/" + images_file_name)
                elif image_uri.startswith("http"):
                    # Remote images are not downloaded here, only reported.
                    internet_image_urls.append(image_uri)

            images_json = json.dumps(images_file_list, ensure_ascii=False)
            print("[No. {}] {}: images - {}".format(count, internal_id, images_json))

            # Parameterized query: the driver escapes the values, so quotes or
            # backslashes in file names / codes can neither break the statement
            # nor be reinterpreted by MySQL's string-literal escaping.
            cursor.execute(
                "UPDATE nfm.m_struct_page SET images = %s WHERE internal_code = %s",
                (images_json, internal_id),
            )
            db.commit()
    finally:
        # Always release the connection, including when a page fails midway.
        db.close()

    for url in internet_image_urls:
        print(url)