65 lines
3.0 KiB
Python
65 lines
3.0 KiB
Python
import scrapy
|
||
|
||
|
||
class ForumspidersItem(scrapy.Item):
    """Placeholder item that declares no fields."""
|
||
|
||
|
||
class ZhihuItem(scrapy.Item):
    """Item shared by three kinds of records: question, answer and comment.

    The field comments give the meaning of each field per record kind.
    When a single meaning is followed by "(all kinds)" it applies to
    questions, answers and comments alike.  Where fewer than three
    meanings are listed, the record kinds they apply to are not marked
    here -- verify against the spiders that populate the item.
    """

    # --- identity -----------------------------------------------------
    item_id = scrapy.Field()         # randomly generated ID (all kinds)
    content_type = scrapy.Field()    # q (question) / a (answer) / c (comment)
    question_id = scrapy.Field()     # question: question ID; answer, comment: associated question ID
    answer_id = scrapy.Field()       # answer ID / associated answer ID
    comment_id = scrapy.Field()      # comment ID

    # --- question / answer location -----------------------------------
    question_title = scrapy.Field()  # question title (all kinds)
    question_link = scrapy.Field()   # question link (all kinds)
    answer_link = scrapy.Field()     # answer link (listed for two kinds)

    # --- content ------------------------------------------------------
    content_text = scrapy.Field()    # content text (all kinds)
    content_html = scrapy.Field()    # content HTML (all kinds)
    image_urls = scrapy.Field()      # image links (all kinds)

    # --- author -------------------------------------------------------
    author_id = scrapy.Field()       # author ID (all kinds)
    author_name = scrapy.Field()     # author name (all kinds)
    author_link = scrapy.Field()     # author home page (all kinds)

    # --- counters -----------------------------------------------------
    upvote_num = scrapy.Field()      # question: "good question" count; answer: upvote count; comment: like count
    comment_num = scrapy.Field()     # comment count (listed for two kinds)

    # --- timestamps and extras ----------------------------------------
    date_created = scrapy.Field()    # publication timestamp (all kinds)
    date_modified = scrapy.Field()   # modification timestamp (all kinds)
    remarks = scrapy.Field()         # remarks (all kinds)
    meta = scrapy.Field()            # metadata (all kinds)
|
||
|
||
|
||
class MdcTwItem(scrapy.Item):
    """Item declaring author, post and image-URL fields.

    NOTE(review): the original carries no field comments; the grouping
    below is based on field names only -- confirm against the spider.
    """

    item_id = scrapy.Field()

    # Author-related fields.
    author_name = scrapy.Field()
    author_link = scrapy.Field()
    author_info = scrapy.Field()

    # Post-related fields.
    post_time = scrapy.Field()
    post_content = scrapy.Field()
    content_image_urls = scrapy.Field()
    post_title = scrapy.Field()
    post_id = scrapy.Field()
|
||
|
||
|
||
class EsItem(scrapy.Item):
    """Item for forum posts (main posts and replies) with ``es_``-prefixed fields."""

    # --- identity and source ------------------------------------------
    es_sid = scrapy.Field()           # sid
    es_hkey = scrapy.Field()          # ID generated from the post content, time, author, etc.
    es_urlname = scrapy.Field()       # URL of the post; for a reply the hkey is appended
    es_sitename = scrapy.Field()      # site name
    es_srcname = scrapy.Field()       # source site URL

    # --- thread structure ---------------------------------------------
    es_urltitle = scrapy.Field()      # post title (main post's title; replies reuse it)
    es_urltopic = scrapy.Field()      # main post ID; links replies to their main post
    es_isrepost = scrapy.Field()      # 0 = main post; 1 = reply

    # --- times --------------------------------------------------------
    es_lasttime = scrapy.Field()      # update time
    es_loadtime = scrapy.Field()      # time the record was stored
    es_urltime = scrapy.Field()       # publication time

    # --- author and content -------------------------------------------
    es_authors = scrapy.Field()       # author
    es_urlimage = scrapy.Field()      # image content
    es_doclength = scrapy.Field()     # post length
    es_content = scrapy.Field()       # post content
    es_urlcontent = scrapy.Field()    # post content with HTML tags removed
    es_carriertype = scrapy.Field()   # assigned the value: forum

    # --- counters and attachments -------------------------------------
    es_commentcount = scrapy.Field()  # reply count; not crawled, left for the business system to query and fill in real time
    es_likecount = scrapy.Field()     # like or upvote count
    es_attachment = scrapy.Field()    # attachment content
|