"""Record the full-size image file names of saved base web pages.

For every ``.html`` file in ``target_file_path`` the page is parsed, its
internal id is read from the result of ``parse_base_info`` and the file names
of the full-size images linked from the ``det_right`` block are written, as a
JSON array, to ``nfm.m_base_page.full_size_images`` of the row with that
internal id.
"""
import json
import os

import pymysql
from lxml import etree

from nfm.mbase.page_parser import parse_base_info

target_file_path = r"E:/yuxin/nuofang-data/base/webpage"
image_path = r"E:/yuxin/nuofang-data/base/images"
full_image_path = r"E:/yuxin/nuofang-data/base/full_images"

# Values are passed as query parameters rather than formatted into the SQL
# text, so quotes inside a file name or id can neither break nor alter the
# statement.
_SQL_UPDATE_FULL_SIZE_IMAGES = (
    "UPDATE nfm.m_base_page SET full_size_images = %s WHERE internal_id = %s"
)


def _read_page(page_path):
    """Return the text of the UTF-8 file at *page_path* with newlines removed."""
    with open(page_path, mode='r', encoding='utf-8') as page_file:
        return page_file.read().replace('\n', '')


def _full_image_names(full_image_urls):
    """Return the last path segment of every URL that starts with ``http``.

    Each accepted URL is also printed; other links are skipped.
    """
    names = []
    for url in full_image_urls:
        if url.startswith("http"):
            print(url)
            names.append(url.split("/")[-1])
    return names


def main():
    """Update ``full_size_images`` for every saved page in ``target_file_path``.

    Raises:
        KeyError: if the parsed base info of a page has no '内部编号' entry.
        Exception: file, parser and database errors propagate unchanged; the
            connection is closed in every case.
    """
    # NOTE(review): the database credentials are hard-coded in source; they
    # should be moved to configuration or the environment and rotated.
    db = pymysql.connect(host='39.101.194.63', port=23306, user='root', passwd='passok123A',
                         db='nfm', charset='utf8mb4')
    try:
        with db.cursor() as cursor:
            for file_name in os.listdir(target_file_path):
                if not file_name.endswith('.html'):
                    continue

                page_path = os.path.join(target_file_path, file_name)
                selector = etree.HTML(_read_page(page_path))

                base_info = parse_base_info(selector, "")
                internal_id = base_info['内部编号']

                full_images_urls = selector.xpath("//div[@id='det_right']//a[@class='img']/@href")
                images = _full_image_names(full_images_urls)

                cursor.execute(
                    _SQL_UPDATE_FULL_SIZE_IMAGES,
                    (json.dumps(images, ensure_ascii=False), internal_id),
                )
                # Commit per page so updates made before a later failure are kept.
                db.commit()
    finally:
        # Release the connection even when a page fails to parse or update.
        db.close()


if __name__ == '__main__':
    main()