osc/research/cmip6/parser/xml_links_parser.py

74 lines
3.3 KiB
Python
Raw Normal View History

2025-05-28 19:16:17 +08:00
import os
import json
import pymysql
from lxml import etree
# Directory of per-dataset THREDDS catalog XML files (one file per CMIP6 instance).
xml_links_path = r'E:/yuxin/cmip20240424/xml'

count = 0        # running total of <dataset> rows inserted; used for progress printing
col_length = {}  # retained for the commented-out column-width probe (currently unused)


def _parse_dataset(dataset, instance_id, http_server_base):
    """Flatten one <dataset> element (a single .nc file entry) into a column->value dict.

    Collects the dataset element's own attributes, its <servicename> text,
    every <property name=... value=...> child, and vocabulary/units/text from
    the first <variable> of its <variables> block.

    :param dataset: lxml element for one ``<dataset>`` node.
    :param instance_id: CMIP6 instance id (the XML filename without ``.xml``).
    :param http_server_base: ``base`` attribute of the HTTPServer <service>.
    :return: dict mapping DB column names to string values.
    """
    item = {'instance_id': instance_id, 'http_server_base': http_server_base}
    for attrib_key, attrib_val in dataset.attrib.items():
        item[attrib_key] = attrib_val
    item['service_name'] = dataset.xpath("./servicename")[0].text
    for prop in dataset.xpath("./property"):
        item[prop.attrib['name']] = prop.attrib['value']
    variables = dataset.xpath("./variables")[0]
    if 'vocabulary' in variables.attrib:
        item['vocabulary'] = variables.attrib['vocabulary']
    variable = variables.xpath('./variable')[0]
    if 'vocabulary_name' in variable.attrib:
        item['vocabulary_name'] = variable.attrib['vocabulary_name']
    if 'units' in variable.attrib:
        item['vocabulary_units'] = variable.attrib['units']
    if variable.text:  # empty string is falsy — same effect as the original len() check
        item['vocabulary_text'] = variable.text
    return item


def _insert_item(cursor, item):
    """Insert one flattened record into jd_data.cmip6_xml_links.

    Uses parameterized ``%s`` placeholders so attribute values containing
    quotes cannot break the statement or inject SQL (the previous version
    built the VALUES list by string formatting). Column names come from XML
    attribute names and cannot be parameterized, so they are backtick-quoted.
    The driver quotes the numeric ``size`` value as a string; MySQL coerces
    it into the numeric column, so the old unquoted special case is unneeded.
    """
    columns = ", ".join("`{}`".format(col) for col in item)
    placeholders = ", ".join(["%s"] * len(item))
    sql = "INSERT INTO jd_data.cmip6_xml_links ({}) VALUES ({})".format(columns, placeholders)
    cursor.execute(sql, list(item.values()))


if __name__ == "__main__":
    # NOTE(security): credentials are hard-coded in source; move them to
    # environment variables or a config file before sharing this script.
    db = pymysql.connect(host='47.113.231.200', port=28089,
                         user='root', passwd='passok123A', db='jd_data', charset='utf8mb4')
    cursor = db.cursor()
    try:
        for _path in os.listdir(xml_links_path):
            if not _path.endswith('.xml'):
                continue
            instance_id = _path[:-4]  # strip the ".xml" suffix
            with open(xml_links_path + '/' + _path, mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().encode('utf-8')
            selector = etree.HTML(html_content)
            http_server_base = selector.xpath("//service[@name='HTTPServer']")[0].attrib['base']
            for dataset in selector.xpath("//dataset[contains(@name,'.nc')]"):
                count += 1
                item = _parse_dataset(dataset, instance_id, http_server_base)
                _insert_item(cursor, item)
                # Surface entries whose download path is not an absolute URL.
                if not item['urlpath'].startswith('http'):
                    print("[No. {}] {}".format(count, item['name']))
                # for _ in item:
                #     if _ in col_length and item[_] and col_length[_] >= len(item[_]):
                #         continue
                #     col_length[_] = len(item[_])
        db.commit()
    finally:
        # Close even on error so the connection is not leaked; uncommitted
        # inserts are rolled back by the server (InnoDB).
        db.close()