65 lines
3.0 KiB
Python
Raw Normal View History

2025-05-28 19:16:17 +08:00
import scrapy
class ForumspidersItem(scrapy.Item):
    """Placeholder Scrapy item that declares no fields of its own."""
class ZhihuItem(scrapy.Item):
    """Scrapy item for a single question, answer, or comment record.

    Each field comment lists what the field holds per record kind, in the
    column order question | answer | comment. Where fewer than three
    meanings are listed, the original column alignment is not recoverable
    from this file, so the kind each meaning belongs to is marked as a guess.
    """

    # Columns: (question) | (answer) | (comment)
    item_id = scrapy.Field()  # randomly generated ID | same | same
    content_type = scrapy.Field()  # q | a | c
    question_id = scrapy.Field()  # question ID | related question ID | related question ID
    answer_id = scrapy.Field()  # answer ID | related answer ID (presumably answer | comment - TODO confirm)
    comment_id = scrapy.Field()  # comment ID (presumably comment records only - TODO confirm)
    question_title = scrapy.Field()  # question title | same | same
    question_link = scrapy.Field()  # question link | same | same
    answer_link = scrapy.Field()  # answer link | answer link (presumably answer | comment - TODO confirm)
    content_text = scrapy.Field()  # content text | same | same
    content_html = scrapy.Field()  # content HTML | same | same
    image_urls = scrapy.Field()  # image links | same | same
    author_id = scrapy.Field()  # author ID | same | same
    author_name = scrapy.Field()  # author name | same | same
    author_link = scrapy.Field()  # author home page | same | same
    upvote_num = scrapy.Field()  # "good question" count | upvote count | like count
    comment_num = scrapy.Field()  # comment count | comment count (presumably question | answer - TODO confirm)
    date_created = scrapy.Field()  # publish timestamp | same | same
    date_modified = scrapy.Field()  # modification timestamp | same | same
    remarks = scrapy.Field()  # remarks | same | same
    meta = scrapy.Field()  # metadata | same | same
class MdcTwItem(scrapy.Item):
    """Scrapy item declaring author- and post-related fields.

    NOTE(review): this file does not document what each field holds; the
    names suggest one scraped post plus its author details - confirm
    against the spider that populates this item.
    """

    item_id = scrapy.Field()
    author_name = scrapy.Field()
    author_link = scrapy.Field()
    author_info = scrapy.Field()
    post_time = scrapy.Field()
    post_content = scrapy.Field()
    content_image_urls = scrapy.Field()
    post_title = scrapy.Field()
    post_id = scrapy.Field()
class EsItem(scrapy.Item):
    """Scrapy item describing one forum post (main post or reply).

    All fields carry an ``es_`` prefix; the per-field comments state what
    each one is meant to hold.
    """

    es_sid = scrapy.Field()  # sid
    es_hkey = scrapy.Field()  # ID generated from the post content, time, author, etc.
    es_urlname = scrapy.Field()  # URL of the post; for a reply, the hkey is appended as well
    es_sitename = scrapy.Field()  # site name
    es_srcname = scrapy.Field()  # source site URL
    es_urltitle = scrapy.Field()  # post title (the main post's title; replies reuse it)
    es_urltopic = scrapy.Field()  # main post ID; used to link replies to their main post
    es_isrepost = scrapy.Field()  # 0 = main post, 1 = reply
    es_lasttime = scrapy.Field()  # update time
    es_loadtime = scrapy.Field()  # time the record was stored
    es_urltime = scrapy.Field()  # publish time
    es_authors = scrapy.Field()  # author
    es_urlimage = scrapy.Field()  # image content
    es_doclength = scrapy.Field()  # post length
    es_content = scrapy.Field()  # post content
    es_urlcontent = scrapy.Field()  # post content with HTML tags removed
    es_carriertype = scrapy.Field()  # set to the value: forum
    es_commentcount = scrapy.Field()  # reply count; not scraped, left for the business system to query and fill in real time
    es_likecount = scrapy.Field()  # like or upvote count
    es_attachment = scrapy.Field()  # attachment content