74 lines
3.3 KiB
Python
74 lines
3.3 KiB
Python
import os
|
|
import json
|
|
|
|
import pymysql
|
|
from lxml import etree
|
|
|
|
# Directory that is scanned for catalog files; every '*.xml' file in it is
# imported and its file name (minus '.xml') is stored as the instance id.
xml_links_path = r'E:/yuxin/cmip20240424/xml'
# Running number of '.nc' dataset entries processed so far (shown in the log line).
count = 0
# Not used by the active code; kept so the module-level name stays available.
col_length = {}


def _quote_identifier(name):
    """Return *name* as a back-tick quoted MySQL identifier.

    Column names come from XML attribute / property names, so they cannot be
    trusted to be plain words.  Back-ticks are doubled (MySQL's escape for a
    back-tick inside a quoted identifier) and '%' is doubled because the
    statement is later %-formatted by the driver when parameters are bound.
    """
    escaped = str(name).replace('`', '``').replace('%', '%%')
    return '`' + escaped + '`'


def _as_param(column, value):
    """Return the query parameter to bind for *value* of *column*.

    'size' is bound as an integer when it parses as one, so it is still sent
    to the server as a bare number; anything else is passed through unchanged
    and left to the driver to escape.
    """
    if column == 'size':
        try:
            return int(value)
        except (TypeError, ValueError):
            return value
    return value


def build_insert(item):
    """Build a parameterized INSERT for one dataset record.

    Args:
        item: mapping of column name -> value; insertion order defines the
            column order of the statement.

    Returns:
        A ``(sql, params)`` tuple suitable for ``cursor.execute(sql, params)``.
        Values are never interpolated into the SQL text, so quotes inside a
        value cannot break (or inject into) the statement, and ``None`` is
        stored as NULL instead of the literal text 'None'.
    """
    columns = ", ".join(_quote_identifier(column) for column in item)
    placeholders = ", ".join(["%s"] * len(item))
    params = [_as_param(column, value) for column, value in item.items()]
    sql = "INSERT INTO jd_data.cmip6_xml_links ({}) VALUES ({})".format(columns, placeholders)
    return sql, params


def _load_catalog(file_path):
    """Parse the catalog file at *file_path* and return its root element."""
    # The text is read as UTF-8 and handed to lxml as bytes, exactly as before.
    # NOTE(review): presumably bytes are used because lxml rejects str input
    # that carries an encoding declaration - confirm.
    with open(file_path, mode='r', encoding='utf-8') as catalog_file:
        content = catalog_file.read().encode('utf-8')
    # NOTE(review): the HTML parser looks deliberate - every XPath and key used
    # on the result is lower-case ('servicename', 'urlpath') - confirm before
    # switching to an XML parser.
    return etree.HTML(content)


def _dataset_to_item(dataset, instance_id, http_server_base):
    """Flatten one ``<dataset>`` element into a column -> value dict.

    Keys are added in this order: the two identifying fields, the dataset's
    own attributes, 'service_name', every ``<property>`` name/value pair, and
    finally the optional vocabulary fields of the first ``<variable>``.

    Raises:
        IndexError: if the dataset has no ``servicename``, ``variables`` or
            ``variables/variable`` child.
        KeyError: if a ``<property>`` lacks a 'name' or 'value' attribute.
    """
    item = {'instance_id': instance_id, 'http_server_base': http_server_base}
    item.update(dataset.attrib.items())
    item['service_name'] = dataset.xpath("./servicename")[0].text

    for prop in dataset.xpath("./property"):
        item[prop.attrib['name']] = prop.attrib['value']

    variables = dataset.xpath("./variables")[0]
    if 'vocabulary' in variables.attrib:
        item['vocabulary'] = variables.attrib['vocabulary']

    # Only the first <variable> child is recorded.
    variable = variables.xpath('./variable')[0]
    if 'vocabulary_name' in variable.attrib:
        item['vocabulary_name'] = variable.attrib['vocabulary_name']
    if 'units' in variable.attrib:
        item['vocabulary_units'] = variable.attrib['units']
    if variable.text:
        item['vocabulary_text'] = variable.text
    return item


def main():
    """Import every '.nc' dataset entry found under ``xml_links_path`` into MySQL.

    Each '*.xml' file is parsed, one row per matching dataset is inserted into
    ``jd_data.cmip6_xml_links``, and everything is committed once at the end.
    If anything fails, nothing is committed; the cursor and connection are
    closed either way.
    """
    global count

    # NOTE(review): host and credentials are hard-coded in source; they should
    # be moved to configuration / environment and the password rotated.
    db = pymysql.connect(host='47.113.231.200', port=28089,
                         user='root', passwd='passok123A', db='jd_data', charset='utf8mb4')
    try:
        cursor = db.cursor()
        try:
            for file_name in os.listdir(xml_links_path):
                if not file_name.endswith('.xml'):
                    continue
                instance_id = file_name[:-4]
                selector = _load_catalog(os.path.join(xml_links_path, file_name))
                http_server_base = selector.xpath("//service[@name='HTTPServer']")[0].attrib['base']

                for dataset in selector.xpath("//dataset[contains(@name,'.nc')]"):
                    count += 1
                    item = _dataset_to_item(dataset, instance_id, http_server_base)
                    sql, params = build_insert(item)
                    cursor.execute(sql, params)
                    # Report entries whose URL path is not an absolute http(s) URL.
                    if not item['urlpath'].startswith('http'):
                        print("[No. {}] {}".format(count, item['name']))
            db.commit()
        finally:
            cursor.close()
    finally:
        db.close()


if __name__ == "__main__":
    main()