osc/research/nuofang-db/nfm/mact/get_source_site.py
2025-05-28 19:16:17 +08:00

39 lines
1.6 KiB
Python

import os
from urllib.parse import urlparse
import pymysql
from lxml import etree
from nfm.mact.page_parser import parse_base_info
# Directory holding the saved activity web pages (*.html) to process.
target_file_path = r"E:/yuxin/nuofang-data/activity/webpage"

# Parameterized statement: the driver escapes both values, so a URL or id
# containing quotes can neither break the statement nor inject SQL.
_SQL_UPDATE_SOURCE = "UPDATE nfm.m_act SET info_source_url = %s WHERE internal_id = %s"

# Absolute XPath of the link text that is stored as info_source_url.
_SOURCE_XPATH = "/html/body/div[2]/div[1]/div[2]/div[3]/div/div/div[5]/a/text()"


def _read_page(file_name):
    """Return the text of one page under target_file_path with newlines removed."""
    # The context manager closes the file even if read() raises.
    with open(target_file_path + '/' + file_name, mode='r', encoding='utf-8') as page_file:
        return page_file.read().replace('\n', '')


def main():
    """Write each saved page's source URL into nfm.m_act.info_source_url.

    For every ``*.html`` file in ``target_file_path`` the page is parsed, the
    source link text is extracted, and the row whose ``internal_id`` matches
    the page's '内部编号' entry (as returned by ``parse_base_info``) is updated.
    All updates are committed together after the last page; if any page
    raises, nothing is committed and the connection is still closed.

    Raises:
        Whatever ``open``, ``etree.HTML``, ``parse_base_info``, the '内部编号'
        lookup, or the database driver raise; errors are not swallowed.
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # moving them to environment variables or a config file kept out of VCS.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        cursor = db.cursor()
        html_files = (name for name in os.listdir(target_file_path) if name.endswith('.html'))
        for count, file_name in enumerate(html_files, start=1):
            selector = etree.HTML(_read_page(file_name))
            # An XPath miss yields an empty list, so source becomes '' and the
            # row's info_source_url is set to the empty string.
            source = ''.join(selector.xpath(_SOURCE_XPATH))
            base_info = parse_base_info(selector)
            internal_id = base_info['内部编号']
            cursor.execute(_SQL_UPDATE_SOURCE, (source, internal_id))
            print("[No. {}]".format(count))
        db.commit()
    finally:
        # Always release the connection, including when a page fails mid-run.
        db.close()


if __name__ == '__main__':
    main()