osc/research/nuofang-db/nfm/mperson/avatar_parser.py

import os
import re
import shutil

import pymysql
from lxml import etree

target_file_path = r"E:\yuxin\nuofang-data\person\webpage"


def trim_space(input_str):
    """Normalize spacing in *input_str*.

    Every ideographic (full-width) space U+3000 is first turned into an
    ASCII space, then each run of two or more ASCII spaces is collapsed
    into a single one. Other whitespace (tabs, newlines) is left as is,
    and leading/trailing single spaces are not stripped.
    """
    ascii_spaced = input_str.replace("\u3000", " ")
    return re.sub(' {2,}', ' ', ascii_spaced)


def parse_avatar_info(current_selector):
    """Return the avatar image ``src`` found in the page's info table.

    Runs an XPath query on *current_selector* for the ``src`` attribute of
    every ``<img>`` under the table whose class is exactly
    ``good-table table1``. All matches are concatenated in document order
    and the result is stripped of surrounding whitespace; an empty string
    is returned when nothing matches.
    """
    src_values = current_selector.xpath("//table[@class='good-table table1']//img/@src")
    joined_src = "".join(src_values)
    return joined_src.strip()


if __name__ == '__main__':
    # Walk every saved person page under target_file_path, copy the avatar
    # image it references into the avatar folder, and record the image file
    # name in nfm.m_person_page for rows that do not have one yet.
    #
    # NOTE(review): the connection below uses hard-coded root credentials for
    # a remote host. They are kept so the script behaves as before, but they
    # should be moved to configuration / environment and rotated.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        cursor = db.cursor()
        for _path in os.listdir(target_file_path):
            if not _path.endswith('.html'):
                continue

            # The context manager closes the file even if reading fails.
            with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')

            selector = etree.HTML(html_content)
            if selector is None:
                # Nothing parseable in this file (e.g. empty document).
                print(['skipped: unparseable page', _path])
                continue

            # The last breadcrumb entry links to the page itself; pages
            # without it cannot be mapped to a person row.
            url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')
            if not url_info:
                print(['skipped: no page url', _path])
                continue
            page_url = url_info[0].replace("#", "")

            avatar_url = parse_avatar_info(selector)
            if not avatar_url:
                continue

            # The person id is taken from a fixed character window of the
            # page URL; it is used as a numeric key, so anything that is not
            # plain digits would only produce a broken statement.
            person_id = page_url[37:45]
            if not (person_id.isascii() and person_id.isdigit()):
                print(['skipped: non-numeric person id', _path, person_id])
                continue

            avatar_file_name = avatar_url.split("/")[-1]
            # avatar_url is relative to the page (leading '.'), so dropping
            # its first character and appending it to the page folder gives
            # the location of the saved image.
            shutil.copy(target_file_path + avatar_url[1:],
                        r"E:\yuxin\nuofang-data\person\person-avatar-2")
            # Parameterized query: scraped values are never spliced into SQL.
            cursor.execute(
                "UPDATE nfm.m_person_page SET images = %s WHERE id = %s AND images IS NULL",
                (avatar_file_name, int(person_id)))
            print([person_id, avatar_file_name])
        # Commit once, only after every page was processed without error.
        db.commit()
    finally:
        db.close()
init 2025-05-28 19:16:17 +08:00			`import os`
			`import re`
			`import shutil`

			`import pymysql`
			`from lxml import etree`

			`target_file_path = r"E:\yuxin\nuofang-data\person\webpage"`


			`def trim_space(input_str):`
			`result = re.sub(' {2,}', ' ', input_str.replace("\u3000", " "))`
			`return result`


			`def parse_avatar_info(current_selector):`
			`img_url_info = current_selector.xpath("//table[@class='good-table table1']//img/@src")`
			`img_url = "".join(img_url_info).strip()`
			`return img_url`


			`if __name__ == '__main__':`
			`count = 0`
			`db = pymysql.connect(host='39.101.194.63', port=23306,`
			`user='root', passwd='passok123A', db='nfm', charset='utf8mb4')`
			`cursor = db.cursor()`
			`for _path in os.listdir(target_file_path):`
			`if _path.endswith('.html'):`
			`# if _path.endswith('梅家樹_军政主官_全球军事态势情报数据库.html'):`
			`count += 1`
			`webpage_info = {}`
			`target_file = open(target_file_path + '\\' + _path, mode='r', encoding='utf-8')`
			`html_content = target_file.read().replace('\n', '')`
			`target_file.close()`
			`selector = etree.HTML(html_content)`
			`title_info = selector.xpath("//div[@class='good-info-box']/div[@class='top cls']/span/text()")`
			`title = ''.join(title_info).strip()`

			`url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]`
			`page_url = url_info.replace("#", "")`
			`avatar_url = parse_avatar_info(selector)`
			`person_id = page_url[37:45]`
			`if len(avatar_url) > 0:`
			`avatar_file_name = avatar_url.split("/")[-1]`
			`sql_update = "UPDATE nfm.m_person_page SET images = '{}' WHERE id = {} AND images IS NULL".format(`
			`avatar_file_name, person_id)`
			`shutil.copy(r"E:\yuxin\nuofang-data\person\webpage" + avatar_url[1:],`
			`r"E:\yuxin\nuofang-data\person\person-avatar-2")`
			`cursor.execute(sql_update)`
			`print([person_id, avatar_file_name])`
			`db.commit()`
			`db.close()`