# -*- coding: utf-8 -*-
import json
import logging
import time

import validators
from scrapy.loader import ItemLoader

from WebsiteSpider.items import WebsiteSpiderItem
from WebsiteSpider.utils.date_utils import transfer_time_zone, get_time_stamp
from WebsiteSpider.utils.http_utils import filter_html_tags, build_url
from WebsiteSpider.utils.traslate_utils import translate_single, translate_content_with_paragraphs, update_record

logger = logging.getLogger(__name__)


def parse_item_from_response(response, parse_rule, redis_client):
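    """Parse one content page into a WebsiteSpiderItem.

    A sketch of the expected inputs, inferred from the usage in this function:

    :param response: the Scrapy Response for a content page.
    :param parse_rule: per-site parse configuration; keys used here include
        'allowDomain' (';'-separated domains), 'fieldMappings' (item field ->
        XPath, e.g. {'es_urltitle': '//h1/text()'}, illustrative), 'timeZone',
        'siteName', 'url', 'id', 'carrierType', 'region', 'storageOption',
        'keywordFilter', and optionally 'siteType', 'resourceType', 'docLength'.
    :param redis_client: a redis.Redis-style client used to read the
        'WebsiteSpider:translate_sites' list.
    :return: (yield_flag, webpage_item) -- yield_flag is True only when the
        item passed all validity checks and should be yielded by the caller.
    """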
    current_url = response.url
    allowed_domains = parse_rule['allowDomain'].split(';')
    mapping = parse_rule['fieldMappings']
    time_zone = float(parse_rule['timeZone'])
    webpage_item = WebsiteSpiderItem()
    item_loader = ItemLoader(item=webpage_item, response=response)
    # Use the content-page regex to decide which parsing template applies.
    for field in webpage_item.fields:
        if field in mapping:
            if field == 'es_urlimage':
                # The real image URL may live in an attribute other than src;
                # in that case the rule must name that attribute with '@'.
                # Otherwise the rule stops at the <img> element and '/@src'
                # is appended here.
                if '@' in str(mapping[field]).split('/')[-1]:
                    item_loader.add_xpath(field, mapping[field])
                else:
                    item_loader.add_xpath(field, mapping[field] + '/@src')
            elif field not in ['es_urlcontent']:
                rule_suffix = mapping[field].split('/')[-1]
                # If the configured rule already ends with /text() or extracts
                # an attribute value with '@', do not append '//text()'.
                if rule_suffix.startswith('text()') or rule_suffix.startswith('@'):
                    item_loader.add_xpath(field, mapping[field])
                else:
                    item_loader.add_xpath(field, mapping[field] + '//text()')
            else:
                item_loader.add_xpath(field, mapping[field])
    webpage_item = item_loader.load_item()
    for field in webpage_item:
        if webpage_item[field] is None:
            continue
        if field == 'es_urlcontent':
            src = '@src'  # By default an <img> tag stores its link in the src attribute.
            if 'es_urlimage' in mapping:
                # The real image link is stored in another attribute, not src.
                if '@' in str(mapping['es_urlimage']).split('/')[-1]:
                    src = str(mapping['es_urlimage']).split('/')[-1]
                    new_content_xpath = mapping[field] + ' | ' + str(mapping['es_urlimage']).replace('/' + src, '')
                else:
                    new_content_xpath = mapping[field] + ' | ' + mapping['es_urlimage']
            else:
                new_content_xpath = mapping[field]
            element_list = response.xpath(new_content_xpath)
            page_content = ''
            for element in element_list:
                # Keep each image at its position in the original page.
                if element.xpath(src):
                    img_src = element.xpath(src).get()
                    if '?' in img_src:
                        img_src = img_src.split('?')[0]
                    logger.info(img_src)
                    img_src = build_url(current_url, allowed_domains, img_src)
                    if not validators.url(img_src):
                        continue
                    # Avoid inserting duplicate images.
                    if img_src not in str(webpage_item['es_urlcontent']):
                        page_content += "<img src=%s>" % img_src
                        page_content += '<br/>\r\n'
                else:
                    text_element = element.getall()
                    for t in text_element:
                        t = t.strip('\r\n')
                        if not t or t.isspace():
                            continue
                        page_content += t
                        page_content += '<br/>\r\n'
            page_content = filter_html_tags(page_content)
            webpage_item[field] = page_content
        elif field == 'es_urlimage' or field == 'es_attachment':
            temp_urls = webpage_item[field]
            webpage_item[field] = []
            for temp_url in temp_urls:
                # Strip query strings from image URLs before normalising them.
                if field == 'es_urlimage' and '?' in temp_url:
                    temp_url = temp_url.split('?')[0]
                full_url = build_url(current_url, allowed_domains, temp_url)
                if validators.url(full_url):
                    webpage_item[field].append(full_url)
            logger.info(webpage_item[field])
        else:
            webpage_item[field] = ''.join(webpage_item[field])
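    # After this loop, es_urlcontent interleaves text and <img src=...> tags in
    # their original document order, e.g. (illustrative):
    #     "First paragraph<br/>\r\n<img src=https://example.com/a.jpg><br/>\r\n"
    # while es_urlimage / es_attachment hold absolute, validated URLs.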
    webpage_item['es_sitename'] = parse_rule['siteName']
    webpage_item['es_srcname'] = parse_rule['url']
    webpage_item['es_startid'] = parse_rule['id']
    webpage_item['es_tags'] = ''
    if 'siteType' in parse_rule and parse_rule['siteType']:
        webpage_item['es_catalog'] = parse_rule['siteType']
    if 'resourceType' in parse_rule and parse_rule['resourceType']:
        webpage_item['es_collection'] = parse_rule['resourceType']
    webpage_item['es_urlname'] = response.url
    logger.info("urlname: %s" % webpage_item['es_urlname'])
    webpage_item['es_loadtime'] = str(int(time.time() * 1000))
    webpage_item['es_carriertype'] = parse_rule['carrierType']
    webpage_item['es_lang'] = parse_rule['region']
    webpage_item['es_urltopic'] = json.dumps(parse_rule['storageOption'])
    webpage_item['es_warning'] = json.dumps(parse_rule['keywordFilter'])
    if 'es_urlcontent' in webpage_item:
        webpage_item['es_doclength'] = len(webpage_item['es_urlcontent'])
    else:
        webpage_item['es_doclength'] = 0
    # Process the extracted time with the configured regex and convert it to a
    # timestamp.
    try:
        logger.info("urltime: %s" % webpage_item['es_urltime'])
    except KeyError:
        logger.info('Failed to parse the time, current page url: %s' % response.url)
    time_parse_rule = None
    if 'dateReg' in mapping:
        time_parse_rule = {
            mapping['dateReg']: [mapping['dateFormate']]
        }
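        # Illustrative only: a dateReg/dateFormate pair might look like
        #     {r'(\d{4}-\d{2}-\d{2})': ['%Y-%m-%d']}
        # i.e. an extraction regex mapped to candidate strptime formats; the
        # exact contract depends on get_time_stamp's implementation.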
    for time_field in ['es_lasttime', 'es_urltime']:
        if time_field in webpage_item and len(webpage_item[time_field]) > 0:
            webpage_item[time_field] = transfer_time_zone(
                get_time_stamp(webpage_item[time_field], time_parse_rule), time_zone)
    try:
        logger.info("es_urltitle: %s" % webpage_item['es_urltitle'])
    except KeyError:
        logger.info('Failed to parse the title, current page url: %s' % response.url)
    try:
        logger.info("es_urlcontent: %s" % (webpage_item['es_urlcontent'][:50] + '......'))
    except KeyError:
        logger.info('Failed to parse the content, current page url: %s' % response.url)
    title_is_not_empty = 'es_urltitle' in webpage_item and webpage_item['es_urltitle'] != ''
    content_is_not_empty = 'es_urlcontent' in webpage_item and webpage_item['es_urlcontent'] != ''
    # docLength may arrive as a string in the rule config, so cast before comparing.
    doc_length_is_valid = 'docLength' not in parse_rule or webpage_item['es_doclength'] >= int(parse_rule['docLength'])
    time_is_valid = ('es_urltime' in webpage_item and webpage_item['es_urltime'] != ''
                     and webpage_item['es_urltime'] != '0'
                     and 0 < int(webpage_item['es_urltime']) < int(time.time() * 1000))
    filter_vip_content = ('filterVIPWords' in mapping and 'es_positiveWords' in webpage_item
                          and mapping['filterVIPWords'] in webpage_item['es_positiveWords'])
    yield_flag = (title_is_not_empty and content_is_not_empty and doc_length_is_valid
                  and time_is_valid and not filter_vip_content)
    if not yield_flag:
        logger.info('Content of URL %s was not parsed; the page may not be a content page, '
                    'or one of the parse-rule problems below may be the cause' % response.url)
        if not title_is_not_empty:
            logger.info('Title is empty; the parse rule is: %s' % mapping.get('es_urltitle'))
        if not content_is_not_empty:
            logger.info('Content is empty; the parse rule is: %s' % mapping.get('es_urlcontent'))
        if not doc_length_is_valid:
            logger.info('Body length does not meet the requirement; the minimum length is: %s'
                        % parse_rule['docLength'])
        if not time_is_valid:
            logger.info('Time could not be parsed; the parse rule is: %s' % mapping.get('es_urltime'))
        if filter_vip_content:
            logger.info('This content is an incomplete VIP article and has been filtered out.')
    if yield_flag:
        try:
            # 1. Fetch the raw list of translate-enabled sites from Redis.
            raw_urls = redis_client.lrange('WebsiteSpider:translate_sites', 0, -1)
            translate_list = [
                url_bytes.decode('utf-8').strip()
                for url_bytes in raw_urls
                if url_bytes and url_bytes.decode('utf-8').strip()
            ]
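            # translate_list now holds source URLs whose items should be
            # machine-translated, e.g. ['https://example.com/news', ...]
            # (illustrative values; the list is maintained outside this module).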
            if webpage_item['es_srcname'] in translate_list:
                # Translate the title.
                webpage_item['es_abstract'] = translate_single(webpage_item['es_urltitle'])
                if webpage_item['es_abstract'] is None:
                    logger.warning(" → Title translation failed; skipping translation for the whole record")
                else:
                    logger.info(f"Translation succeeded; translated title length: {len(webpage_item['es_abstract'])}")
                    # Translate the content paragraph by paragraph, tolerating per-paragraph failures.
                    no_tag_content = filter_html_tags(webpage_item['es_urlcontent'], retain_img_br=False)
                    webpage_item['es_content'] = translate_content_with_paragraphs(no_tag_content)
                    logger.info(f"Translation succeeded; translated content length: {len(webpage_item['es_content'])}")
        except Exception as e:
            logger.error(repr(e))
    return yield_flag, webpage_item
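
# A minimal usage sketch, assuming a Scrapy spider callback with a parse-rule
# dict and a Redis client on `self` (names below are hypothetical, not part of
# this module):
#
#     def parse_detail(self, response):
#         parsed_ok, item = parse_item_from_response(
#             response, self.parse_rule, self.redis_client)
#         if parsed_ok:
#             yield item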