2025-05-28 19:16:17 +08:00

65 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import scrapy
class ForumspidersItem(scrapy.Item):
    """Placeholder Scrapy item that declares no fields of its own."""


class ZhihuItem(scrapy.Item):
    """Scrapy item shared by three kinds of Zhihu records.

    The per-field comments describe what the field holds for each record
    kind, in the order: question | answer | comment. When a comment lists
    fewer than three entries, the original column alignment was lost, so
    the mapping of entries to record kinds is noted as an assumption.
    """

    # Column order in the comments below: (question) | (answer) | (comment)
    item_id = scrapy.Field()  # randomly generated ID (all three kinds)
    content_type = scrapy.Field()  # q | a | c
    question_id = scrapy.Field()  # question ID | related question ID | related question ID
    # answer ID | related answer ID
    # NOTE(review): only two entries; presumably answer and comment -- confirm.
    answer_id = scrapy.Field()
    # comment ID
    # NOTE(review): single entry; presumably applies to comments only -- confirm.
    comment_id = scrapy.Field()
    question_title = scrapy.Field()  # question title (all three kinds)
    question_link = scrapy.Field()  # question link (all three kinds)
    # answer link | answer link
    # NOTE(review): only two entries; presumably answer and comment -- confirm.
    answer_link = scrapy.Field()
    content_text = scrapy.Field()  # content text (all three kinds)
    content_html = scrapy.Field()  # content HTML (all three kinds)
    image_urls = scrapy.Field()  # image links (all three kinds)
    author_id = scrapy.Field()  # author ID (all three kinds)
    author_name = scrapy.Field()  # author name (all three kinds)
    author_link = scrapy.Field()  # author home page (all three kinds)
    upvote_num = scrapy.Field()  # "good question" count | agree count | like count
    # comment count | comment count
    # NOTE(review): only two entries; which record kinds they map to is unclear.
    comment_num = scrapy.Field()
    date_created = scrapy.Field()  # publication timestamp (all three kinds)
    date_modified = scrapy.Field()  # modification timestamp (all three kinds)
    remarks = scrapy.Field()  # remarks (all three kinds)
    meta = scrapy.Field()  # metadata (all three kinds)


class MdcTwItem(scrapy.Item):
    """Scrapy item declaring post and author fields.

    The source carries no per-field documentation; the grouping comments
    below are based on the field names only.
    """

    item_id = scrapy.Field()

    # Author-related fields.
    author_name = scrapy.Field()
    author_link = scrapy.Field()
    author_info = scrapy.Field()

    # Post-related fields.
    post_time = scrapy.Field()
    post_content = scrapy.Field()
    content_image_urls = scrapy.Field()
    post_title = scrapy.Field()
    post_id = scrapy.Field()


class EsItem(scrapy.Item):
    """Scrapy item whose fields all carry the ``es_`` prefix.

    Each field comment is an English translation of the original inline
    note describing what the field is meant to hold.
    """

    es_sid = scrapy.Field()  # sid
    es_hkey = scrapy.Field()  # ID generated from post content, time, author, etc.
    es_urlname = scrapy.Field()  # URL of the post; for a reply, the hkey is appended
    es_sitename = scrapy.Field()  # site name
    es_srcname = scrapy.Field()  # source site URL
    es_urltitle = scrapy.Field()  # post title (replies reuse the main post's title)
    es_urltopic = scrapy.Field()  # main post ID; links replies to their main post
    es_isrepost = scrapy.Field()  # 0 = main post, 1 = reply
    es_lasttime = scrapy.Field()  # update time
    es_loadtime = scrapy.Field()  # ingestion (storage) time
    es_urltime = scrapy.Field()  # publication time
    es_authors = scrapy.Field()  # author
    es_urlimage = scrapy.Field()  # image content
    es_doclength = scrapy.Field()  # post length
    es_content = scrapy.Field()  # post content
    es_urlcontent = scrapy.Field()  # post content with HTML tags removed
    es_carriertype = scrapy.Field()  # assigned the value "forum"
    # Reply count. Not scraped; left for the business system to query and
    # fill in at read time.
    es_commentcount = scrapy.Field()
    es_likecount = scrapy.Field()  # number of likes / upvotes
    es_attachment = scrapy.Field()  # attachment content