# -*- coding: utf-8 -*-
import json
import logging
import time

import validators
from scrapy.loader import ItemLoader

from WebsiteSpider.items import WebsiteSpiderItem
from WebsiteSpider.utils.date_utils import transfer_time_zone, get_time_stamp
from WebsiteSpider.utils.http_utils import filter_html_tags, build_url
from WebsiteSpider.utils.traslate_utils import translate_single, translate_content_with_paragraphs, update_record

logger = logging.getLogger(__name__)


def parse_item_from_response(response, parse_rule, redis_client):
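    """Parse a content page into a WebsiteSpiderItem according to a parse rule.

    Args:
        response: the Scrapy Response for the content page.
        parse_rule: a dict of site-level settings. Keys used here include
            'allowDomain' (';'-separated domain list), 'fieldMappings'
            (item field -> XPath rule, plus 'dateReg', 'dateFormate' and
            'filterVIPWords'), 'timeZone', 'siteName', 'url', 'id',
            'siteType', 'resourceType', 'carrierType', 'region',
            'storageOption', 'keywordFilter' and, optionally, 'docLength'.
        redis_client: a Redis client whose 'WebsiteSpider:translate_sites'
            list names the source sites whose items should be translated.

    Returns:
        A (yield_flag, webpage_item) tuple: whether the item passed the
        title/content/length/time checks, and the populated item.
    """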
    current_url = response.url
    allowed_domains = parse_rule['allowDomain'].split(';')
    mapping = parse_rule['fieldMappings']
    time_zone = float(parse_rule['timeZone'])
    webpage_item = WebsiteSpiderItem()
    item_loader = ItemLoader(item=webpage_item, response=response)
    # Decide which parsing template to apply based on the content-page rules
    for field in webpage_item.fields:
        if field in mapping:
            if field == 'es_urlimage':
                # The real image URL may live in an attribute other than src;
                # such rules must end with an explicit @attribute selector.
                # Otherwise the rule stops at the element and /@src is appended.
                if '@' in str(mapping[field]).split('/')[-1]:
                    item_loader.add_xpath(field, mapping[field])
                else:
                    item_loader.add_xpath(field, mapping[field] + '/@src')
            elif field != 'es_urlcontent':
                rule_suffix = mapping[field].split('/')[-1]
                # If the configured rule already ends with /text() or extracts
                # an attribute with @, do not append //text().
                if rule_suffix.startswith('text()') or rule_suffix.startswith('@'):
                    item_loader.add_xpath(field, mapping[field])
                else:
                    item_loader.add_xpath(field, mapping[field] + '//text()')
            else:
                item_loader.add_xpath(field, mapping[field])
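
    # Illustrative (hypothetical) rule expansions performed by the loop above:
    #   '//h1[@class="title"]'            -> '//h1[@class="title"]//text()'
    #   '//span[@id="date"]/text()'       -> used as-is
    #   '//meta[@name="author"]/@content' -> used as-is
    #   es_urlimage '//div[@id="body"]//img' -> '/@src' appended
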
    webpage_item = item_loader.load_item()
    for field in webpage_item:
        if webpage_item[field] is None:
            continue
        if field == 'es_urlcontent':
            src = '@src'  # by default an <img> tag stores its link in the src attribute
            if 'es_urlimage' in mapping:
                # The real image URL is stored in another attribute, not src
                if '@' in str(mapping['es_urlimage']).split('/')[-1]:
                    src = str(mapping['es_urlimage']).split('/')[-1]
                    new_content_xpath = mapping[field] + ' | ' + str(mapping['es_urlimage']).replace(
                        '/' + src, '')
                else:
                    new_content_xpath = mapping[field] + ' | ' + mapping['es_urlimage']
            else:
                new_content_xpath = mapping[field]
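            # Example (hypothetical): an es_urlcontent rule of
            # '//div[@id="content"]' with an es_urlimage rule of
            # '//div[@id="content"]//img/@data-src' gives src = '@data-src'
            # and new_content_xpath =
            # '//div[@id="content"] | //div[@id="content"]//img'.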
            element_list = response.xpath(new_content_xpath)
            page_content = ''
            for element in element_list:
                # Keep each image at its position in the original page
                if element.xpath(src):
                    img_src = element.xpath(src).get()
                    if '?' in img_src:
                        img_src = img_src.split('?')[0]
                    logger.info(img_src)
                    img_src = build_url(current_url, allowed_domains, img_src)
                    if not validators.url(img_src):
                        continue
                    # Avoid inserting duplicate images
                    if img_src not in str(webpage_item['es_urlcontent']):
                        page_content += '<img src="%s">' % img_src
                        page_content += '<br/>\r\n'
                else:
                    for t in element.getall():
                        t = t.strip('\r\n')
                        # Skip fragments that are empty or whitespace-only
                        if not t or t.isspace():
                            continue
                        page_content += t
                        page_content += '<br/>\r\n'
            page_content = filter_html_tags(page_content)
            webpage_item[field] = page_content
        elif field == 'es_urlimage' or field == 'es_attachment':
            temp_urls = webpage_item[field]
            webpage_item[field] = []
            for temp_url in temp_urls:
                # Drop query strings from image URLs before resolving them
                if field == 'es_urlimage' and '?' in temp_url:
                    temp_url = temp_url.split('?')[0]
                full_url = build_url(current_url, allowed_domains, temp_url)
                if validators.url(full_url):
                    webpage_item[field].append(full_url)
            logger.info(webpage_item[field])
        else:
            # Scalar fields arrive from the ItemLoader as lists of extracted
            # strings; join them into a single value.
            webpage_item[field] = ''.join(webpage_item[field])

    webpage_item['es_sitename'] = parse_rule['siteName']
    webpage_item['es_srcname'] = parse_rule['url']
    webpage_item['es_startid'] = parse_rule['id']
    webpage_item['es_tags'] = ''
    if 'siteType' in parse_rule and parse_rule['siteType']:
        webpage_item['es_catalog'] = parse_rule['siteType']
    if 'resourceType' in parse_rule and parse_rule['resourceType']:
        webpage_item['es_collection'] = parse_rule['resourceType']
    webpage_item['es_urlname'] = response.url
    logger.info("urlname: %s" % webpage_item['es_urlname'])
    webpage_item['es_loadtime'] = str(int(time.time() * 1000))
    webpage_item['es_carriertype'] = parse_rule['carrierType']
    webpage_item['es_lang'] = parse_rule['region']
    webpage_item['es_urltopic'] = json.dumps(parse_rule['storageOption'])
    webpage_item['es_warning'] = json.dumps(parse_rule['keywordFilter'])
    if 'es_urlcontent' in webpage_item:
        webpage_item['es_doclength'] = len(webpage_item['es_urlcontent'])
    else:
        webpage_item['es_doclength'] = 0
    # Normalize the extracted time into a timestamp using the configured regex
    try:
        logger.info("urltime: %s" % webpage_item['es_urltime'])
    except KeyError:
        logger.info('Failed to parse time, current page url: %s' % response.url)

    time_parse_rule = None
    if 'dateReg' in mapping and 'dateFormate' in mapping:
        time_parse_rule = {
            mapping['dateReg']: [mapping['dateFormate']]
        }
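    # For illustration (hypothetical rule): a dateReg of r'(\d{4}-\d{2}-\d{2})'
    # with a dateFormate of '%Y-%m-%d' produces
    # {r'(\d{4}-\d{2}-\d{2})': ['%Y-%m-%d']}.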
    for time_field in ['es_lasttime', 'es_urltime']:
        if time_field in webpage_item and len(webpage_item[time_field]) > 0:
            webpage_item[time_field] = transfer_time_zone(
                get_time_stamp(webpage_item[time_field], time_parse_rule), time_zone)

    try:
        logger.info("es_urltitle: %s" % webpage_item['es_urltitle'])
    except KeyError:
        logger.info('Failed to parse title, current page url: %s' % response.url)
    try:
        logger.info("es_urlcontent: %s" % (webpage_item['es_urlcontent'][:50] + '......'))
    except KeyError:
        logger.info('Failed to parse content, current page url: %s' % response.url)

    title_is_not_empty = 'es_urltitle' in webpage_item and webpage_item['es_urltitle'] != ''
    content_is_not_empty = 'es_urlcontent' in webpage_item and webpage_item['es_urlcontent'] != ''
    doc_length_is_valid = 'docLength' not in parse_rule or webpage_item['es_doclength'] >= parse_rule['docLength']
    time_is_valid = ('es_urltime' in webpage_item and webpage_item['es_urltime'] != ''
                     and webpage_item['es_urltime'] != '0'
                     and 0 < int(webpage_item['es_urltime']) < int(time.time() * 1000))
    # Guard the mapping lookup so a missing 'filterVIPWords' key cannot raise
    filter_vip_content = ('es_positiveWords' in webpage_item and 'filterVIPWords' in mapping
                          and mapping['filterVIPWords'] in webpage_item['es_positiveWords'])

    yield_flag = (title_is_not_empty and content_is_not_empty and doc_length_is_valid
                  and time_is_valid and not filter_vip_content)

    if not yield_flag:
        logger.info('Content at URL %s was not parsed; the page may not be a content page, '
                    'or one of the parse-rule problems below may be the cause' % response.url)
        if not title_is_not_empty:
            logger.info('Title is empty; the parse rule is: %s' % mapping.get('es_urltitle'))
        if not content_is_not_empty:
            logger.info('Content is empty; the parse rule is: %s' % mapping.get('es_urlcontent'))
        if not doc_length_is_valid:
            logger.info('Body length below the required minimum: %s' % parse_rule['docLength'])
        if not time_is_valid:
            logger.info('Time could not be parsed; the parse rule is: %s' % mapping.get('es_urltime'))
        if filter_vip_content:
            logger.info('The current content is a VIP article and is incomplete; it has been filtered out.')

    if yield_flag:
        try:
            # 1. Fetch the raw list of sites to translate from Redis
            raw_urls = redis_client.lrange('WebsiteSpider:translate_sites', 0, -1)
            translate_list = [
                url_bytes.decode('utf-8').strip()
                for url_bytes in raw_urls
                if url_bytes and url_bytes.decode('utf-8').strip()
            ]
            if webpage_item['es_srcname'] in translate_list:
                # Translate the title
                webpage_item['es_abstract'] = translate_single(webpage_item['es_urltitle'])
                if webpage_item['es_abstract'] is None:
                    logger.warning("Title translation failed; skipping translation for this record")
                else:
                    logger.info(f"Translation succeeded; translated title length: {len(webpage_item['es_abstract'])}")
                    # Translate the content (paragraph by paragraph, fault-tolerant)
                    no_tag_content = filter_html_tags(webpage_item['es_urlcontent'], retain_img_br=False)
                    webpage_item['es_content'] = translate_content_with_paragraphs(no_tag_content)
                    logger.info(f"Translation succeeded; translated content length: {len(webpage_item['es_content'])}")
        except Exception as e:
            logger.error(repr(e))

    return yield_flag, webpage_item
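

# --- Usage sketch (not part of the original module) ---
# A minimal example of how a spider callback might consume
# parse_item_from_response. The spider class, its attribute wiring, and the
# callback name are hypothetical; the real spiders in this project may differ.
import scrapy


class ExampleContentSpider(scrapy.Spider):
    name = 'example_content_spider'

    def __init__(self, parse_rule=None, redis_client=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_rule = parse_rule
        self.redis_client = redis_client

    def parse_content_page(self, response):
        # Only yield items that passed the title/content/length/time checks.
        yield_flag, item = parse_item_from_response(
            response, self.parse_rule, self.redis_client)
        if yield_flag:
            yield item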