osc/research/cmip6/parser/xml_links_parser.py

74 lines
3.3 KiB
Python
Raw Normal View History

2025-05-28 19:16:17 +08:00
import os
import json
import pymysql
from lxml import etree
# Directory of per-dataset THREDDS catalog XML files (one file per CMIP6 instance).
xml_links_path = r'E:/yuxin/cmip20240424/xml'

count = 0        # running total of <dataset> rows inserted; used for progress printing
col_length = {}  # retained for the commented-out column-width probe (currently unused)


def _parse_dataset(dataset, instance_id, http_server_base):
    """Flatten one <dataset> element (a single .nc file entry) into a column->value dict.

    Collects the dataset element's own attributes, its <servicename> text,
    every <property name=... value=...> child, and vocabulary/units/text from
    the first <variable> of its <variables> block.

    :param dataset: lxml element for one ``<dataset>`` node.
    :param instance_id: CMIP6 instance id (the XML filename without ``.xml``).
    :param http_server_base: ``base`` attribute of the HTTPServer <service>.
    :return: dict mapping DB column names to string values.
    """
    item = {'instance_id': instance_id, 'http_server_base': http_server_base}
    for attrib_key, attrib_val in dataset.attrib.items():
        item[attrib_key] = attrib_val
    item['service_name'] = dataset.xpath("./servicename")[0].text
    for prop in dataset.xpath("./property"):
        item[prop.attrib['name']] = prop.attrib['value']
    variables = dataset.xpath("./variables")[0]
    if 'vocabulary' in variables.attrib:
        item['vocabulary'] = variables.attrib['vocabulary']
    variable = variables.xpath('./variable')[0]
    if 'vocabulary_name' in variable.attrib:
        item['vocabulary_name'] = variable.attrib['vocabulary_name']
    if 'units' in variable.attrib:
        item['vocabulary_units'] = variable.attrib['units']
    if variable.text:  # empty string is falsy — same effect as the original len() check
        item['vocabulary_text'] = variable.text
    return item


def _insert_item(cursor, item):
    """Insert one flattened record into jd_data.cmip6_xml_links.

    Uses parameterized ``%s`` placeholders so attribute values containing
    quotes cannot break the statement or inject SQL (the previous version
    built the VALUES list by string formatting). Column names come from XML
    attribute names and cannot be parameterized, so they are backtick-quoted.
    The driver quotes the numeric ``size`` value as a string; MySQL coerces
    it into the numeric column, so the old unquoted special case is unneeded.
    """
    columns = ", ".join("`{}`".format(col) for col in item)
    placeholders = ", ".join(["%s"] * len(item))
    sql = "INSERT INTO jd_data.cmip6_xml_links ({}) VALUES ({})".format(columns, placeholders)
    cursor.execute(sql, list(item.values()))


if __name__ == "__main__":
    # NOTE(security): credentials are hard-coded in source; move them to
    # environment variables or a config file before sharing this script.
    db = pymysql.connect(host='47.113.231.200', port=28089,
                         user='root', passwd='passok123A', db='jd_data', charset='utf8mb4')
    cursor = db.cursor()
    try:
        for _path in os.listdir(xml_links_path):
            if not _path.endswith('.xml'):
                continue
            instance_id = _path[:-4]  # strip the ".xml" suffix
            with open(xml_links_path + '/' + _path, mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().encode('utf-8')
            selector = etree.HTML(html_content)
            http_server_base = selector.xpath("//service[@name='HTTPServer']")[0].attrib['base']
            for dataset in selector.xpath("//dataset[contains(@name,'.nc')]"):
                count += 1
                item = _parse_dataset(dataset, instance_id, http_server_base)
                _insert_item(cursor, item)
                # Surface entries whose download path is not an absolute URL.
                if not item['urlpath'].startswith('http'):
                    print("[No. {}] {}".format(count, item['name']))
                # for _ in item:
                #     if _ in col_length and item[_] and col_length[_] >= len(item[_]):
                #         continue
                #     col_length[_] = len(item[_])
        db.commit()
    finally:
        # Close even on error so the connection is not leaked; uncommitted
        # inserts are rolled back by the server (InnoDB).
        db.close()