# osc/research/nuofang-db/nfm/mbase/image_parser.py

import json
import os

import pymysql
from lxml import etree

from nfm.mbase.page_parser import parse_base_info

# Directory whose saved ``.html`` pages are listed and parsed by the script below.
target_file_path = r"E:/yuxin/nuofang-data/base/webpage"
# Image directories; neither is referenced by any active code in this script.
# NOTE(review): presumably destinations for thumbnail / full-size image copies
# -- confirm with other modules before changing or removing them.
image_path = r"E:/yuxin/nuofang-data/base/images"
full_image_path = r"E:/yuxin/nuofang-data/base/full_images"

def main():
    """Store the full-size image file names of every saved base page.

    For each ``.html`` file in ``target_file_path`` the page is parsed with
    lxml, its internal id is read from ``parse_base_info`` (key ``'内部编号'``),
    and the ``href`` of every ``<a class="img">`` under ``<div id="det_right">``
    is collected. Only hrefs starting with ``http`` are kept; the last path
    segment of each is treated as the image file name. The resulting list is
    written as a JSON array into ``nfm.m_base_page.full_size_images`` for the
    row whose ``internal_id`` matches.

    All updates are committed once, after every file has been processed; if
    any file fails, nothing is committed and the connection is still closed.

    Raises:
        KeyError: if ``parse_base_info`` returns no ``'内部编号'`` entry.
        OSError: if a page file cannot be read.
    """
    # NOTE(review): host and credentials are hard-coded in source; they should
    # come from configuration or the environment. Left unchanged here so the
    # script keeps connecting to the same database.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm', charset='utf8mb4')
    # Driver-side placeholders: the JSON payload may contain quotes or
    # backslashes that would corrupt (or inject into) a string-formatted query.
    sql_update = "UPDATE nfm.m_base_page SET full_size_images = %s WHERE internal_id = %s"
    try:
        cursor = db.cursor()
        for _path in os.listdir(target_file_path):
            if not _path.endswith('.html'):
                continue
            page_file = os.path.join(target_file_path, _path)
            with open(page_file, mode='r', encoding='utf-8') as target_file:
                # Newlines are stripped before parsing, as the page parser
                # is fed the document as a single line.
                html_content = target_file.read().replace('\n', '')
            selector = etree.HTML(html_content)
            base_info = parse_base_info(selector, "")
            internal_id = base_info['内部编号']
            full_images_urls = selector.xpath("//div[@id='det_right']//a[@class='img']/@href")
            images = []
            for url in full_images_urls:
                # Relative links are skipped; only absolute http(s) hrefs count.
                if url.startswith("http"):
                    print(url)
                    images.append(url.split("/")[-1])
            cursor.execute(sql_update, (json.dumps(images, ensure_ascii=False), internal_id))
        db.commit()
    finally:
        # Release the connection even when a page fails to parse or update.
        db.close()


if __name__ == '__main__':
    main()
init 2025-05-28 19:16:17 +08:00			`import json`
			`import os`

			`import pymysql`
			`from lxml import etree`

			`from nfm.mbase.page_parser import parse_base_info`

			`target_file_path = r"E:/yuxin/nuofang-data/base/webpage"`
			`image_path = r"E:/yuxin/nuofang-data/base/images"`
			`full_image_path = r"E:/yuxin/nuofang-data/base/full_images"`

			`if __name__ == '__main__':`
			`count = 0`
			`db = pymysql.connect(host='39.101.194.63', port=23306,`
			`user='root', passwd='passok123A', db='nfm', charset='utf8mb4')`
			`cursor = db.cursor()`
			`for _path in os.listdir(target_file_path):`
			`if _path.endswith('.html'):`
			`# if _path.endswith('埃尔门多夫-理查森堡联合基地_综合类设施_全球军事态势情报数据库.html'):`
			`count += 1`
			`target_file = open(target_file_path + '\\' + _path, mode='r', encoding='utf-8')`
			`html_content = target_file.read().replace('\n', '')`
			`target_file.close()`
			`selector = etree.HTML(html_content)`
			`base_info = parse_base_info(selector, "")`
			`internal_id = base_info['内部编号']`
			`img_uris = selector.xpath("//div[@id='det_right']//a[@class='img']/img/@src")`
			`full_images_urls = selector.xpath("//div[@id='det_right']//a[@class='img']/@href")`
			`images = []`
			`image = ''`
			`# print("[No. {}] {}: {} images".format(count, _path[:-23], len(full_images_urls)))`
			`for url in full_images_urls:`
			`if url.startswith("http"):`
			`print(url)`
			`img_file_name = url.split("/")[-1]`
			`images.append(img_file_name)`
			`# for uri in img_uris:`
			`# if uri.startswith("./"):`
			`# img_file_name = uri.split("/")[-1]`
			`# images.append(img_file_name)`
			`# shutil.copy(target_file_path + uri[1:], image_path)`
			`# if len(images) > 0:`
			`# image = images[0]`
			`sql_update = "UPDATE nfm.m_base_page SET full_size_images = '{}' WHERE internal_id = '{}'".format(`
			`json.dumps(images, ensure_ascii=False), internal_id)`
			`cursor.execute(sql_update)`
			`db.commit()`
			`db.close()`