import os
import json
import pymysql
from lxml import etree

# Directory scanned for ``*.xml`` catalog files.
xml_links_path = r'E:/yuxin/cmip20240424/xml'
# Running total of dataset rows processed across all catalog files.
count = 0
# Not used by the active code path; retained so the module-level name stays defined.
col_length = {}


def _parse_dataset(dataset, instance_id, http_server_base):
    """Flatten one ``dataset`` element into an ordered column -> value mapping.

    The mapping holds, in insertion order: the two values passed in, every
    attribute of the element, the text of its ``servicename`` child, every
    ``property`` child as ``name -> value``, and the optional vocabulary
    fields taken from the first ``variables/variable`` child.

    Raises:
        IndexError: if the element has no ``servicename``, ``variables`` or
            ``variables/variable`` child.
        KeyError: if a ``property`` child lacks a ``name`` or ``value``
            attribute.
    """
    item = {'instance_id': instance_id, 'http_server_base': http_server_base}
    item.update(dict(dataset.attrib))
    item['service_name'] = dataset.xpath("./servicename")[0].text

    for prop in dataset.xpath("./property"):
        item[prop.attrib['name']] = prop.attrib['value']

    variables = dataset.xpath("./variables")[0]
    if 'vocabulary' in variables.attrib:
        item['vocabulary'] = variables.attrib['vocabulary']

    # Only the first <variable> child is recorded.
    variable = variables.xpath('./variable')[0]
    if 'vocabulary_name' in variable.attrib:
        item['vocabulary_name'] = variable.attrib['vocabulary_name']
    if 'units' in variable.attrib:
        item['vocabulary_units'] = variable.attrib['units']
    if variable.text:
        item['vocabulary_text'] = variable.text
    return item


def _build_insert(item):
    """Return ``(sql, params)`` for a parameterized INSERT of *item*.

    Values are passed as driver parameters instead of being formatted into
    the statement, so quotes or backslashes in the XML cannot break or alter
    the SQL, and ``None`` is stored as NULL rather than the text ``'None'``.
    Column names come from XML attribute/property names and cannot be bound
    as parameters, so they are backtick-quoted with embedded backticks
    doubled; ``%`` is doubled because the driver %-formats the statement.
    """
    columns = list(item)
    quoted_columns = ", ".join(
        "`{}`".format(col.replace("`", "``").replace("%", "%%"))
        for col in columns
    )
    placeholders = ", ".join(["%s"] * len(columns))
    sql = "INSERT INTO jd_data.cmip6_xml_links ({}) VALUES ({})".format(
        quoted_columns, placeholders)

    params = []
    for col in columns:
        value = item[col]
        # 'size' was previously written unquoted (as a number); keep sending
        # it as a number whenever it really is one.
        if col == 'size' and isinstance(value, str) and value.strip().isdigit():
            value = int(value)
        params.append(value)
    return sql, params


if __name__ == "__main__":
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from the environment or a config file kept out of VCS.
    db = pymysql.connect(host='47.113.231.200', port=28089, user='root',
                         passwd='passok123A', db='jd_data', charset='utf8mb4')
    try:
        cursor = db.cursor()
        for _path in os.listdir(xml_links_path):
            if not _path.endswith('.xml'):
                continue

            # The file name without its ".xml" suffix identifies the instance.
            instance_id = _path[:-4]
            with open(xml_links_path + '/' + _path, mode='r',
                      encoding='utf-8') as target_file:
                html_content = target_file.read().encode('utf-8')

            # NOTE(review): the queries below use lowercase names
            # (``servicename``, ``urlpath``); presumably they rely on the
            # HTML parser's name normalization, so the parser is kept as-is.
            selector = etree.HTML(html_content)
            http_server_base = selector.xpath(
                "//service[@name='HTTPServer']")[0].attrib['base']

            for dataset in selector.xpath("//dataset[contains(@name,'.nc')]"):
                count += 1
                item = _parse_dataset(dataset, instance_id, http_server_base)
                sql_insert, sql_params = _build_insert(item)
                cursor.execute(sql_insert, sql_params)

                # Report rows whose URL path is not an absolute http(s) URL.
                if not item['urlpath'].startswith('http'):
                    print("[No. {}] {}".format(count, item['name']))

        # Single commit at the end: a failure part-way leaves nothing committed.
        db.commit()
    finally:
        db.close()