Compare commits

..

3 Commits

Author SHA1 Message Date
yuxin-pc
cfe6c3af85 本地改动同步 2026-02-12 09:36:36 +08:00
yuxin-pc
63b0d7090a Update settings.py
翻译字数设置
2026-02-12 09:36:06 +08:00
yuxin-pc
dfb65f11bf 针对6s采集的改动 2026-02-12 09:35:23 +08:00
7 changed files with 41 additions and 45 deletions

View File

@@ -21,14 +21,11 @@ DB_CONFIG = {
TRANSLATE_API_URL = "http://47.113.231.200:28081/translate" TRANSLATE_API_URL = "http://47.113.231.200:28081/translate"
# 指定时间格式YYYY-MM-DD HH:MM:SS # 指定时间格式YYYY-MM-DD HH:MM:SS
LOADTIME_AFTER = "2026-01-16 10:40:00" LOADTIME_AFTER = "2026-02-10 11:59:00"
# 目标站点列表 # 目标站点列表
TARGET_SRCNAMES = [ TARGET_SRCNAMES = [
'http://www.rodong.rep.kp/ko/index.php?MUBAMUAxQA==', 'https://www.38north.org/' # 添加你的站点
'http://www.kcna.kp/kp/category/articles/q/5394b80bdae203fadef02522cfb578c0.kcmsf',
'https://energynow.com/category/press_releases/',
'https://www.fao.org/newsroom/en' # 添加你的站点
] ]
# 单次请求间隔(秒),避免 API 被限流 # 单次请求间隔(秒),避免 API 被限流
@@ -104,7 +101,7 @@ def translate_content_with_paragraphs(content: str) -> str:
def update_record(cursor, es_sid: int, new_title: str, new_content: str): def update_record(cursor, es_sid: int, new_title: str, new_content: str):
update_query = """ update_query = """
UPDATE indeximos UPDATE indeximos
SET es_title = % s, es_content = % s SET es_abstract = % s, es_content = % s
WHERE es_sid = % s WHERE es_sid = % s
""" """
cursor.execute(update_query, (new_title, new_content, es_sid)) cursor.execute(update_query, (new_title, new_content, es_sid))
@@ -122,8 +119,8 @@ def main():
SELECT es_sid, es_urltitle, es_urlcontent SELECT es_sid, es_urltitle, es_urlcontent
FROM indeximos FROM indeximos
WHERE es_loadtime > %s WHERE es_loadtime > %s
AND (es_title IS NULL OR TRIM(es_title) = '') AND (es_content IS NULL OR TRIM(es_content) = '')
AND es_srcname IN ({placeholders}) -- AND es_srcname IN ({placeholders})
AND LENGTH(es_video) > 5 AND LENGTH(es_video) > 5
""" """
params = [LOADTIME_AFTER] + TARGET_SRCNAMES params = [LOADTIME_AFTER] + TARGET_SRCNAMES

View File

@@ -76,7 +76,7 @@ class SeleniumMiddleware:
} }
edge_options.add_experimental_option("prefs", prefs) edge_options.add_experimental_option("prefs", prefs)
self.driver = Edge(executable_path="C:/Users/DELL/Downloads/edgedriver_win64/msedgedriver.exe", options=edge_options) self.driver = Edge(executable_path=r"C:\Program Files\Python38\msedgedriver.exe", options=edge_options)
@classmethod @classmethod
def from_crawler(cls, crawler): def from_crawler(cls, crawler):

View File

@@ -110,8 +110,8 @@ CUSTOM_USER_AGENT = [
# 部署在外网采集fb时使用selenium_chrome # 部署在外网采集fb时使用selenium_chrome
SELENIUM_DRIVER_NAME = 'chrome' SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = 'local' # SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098' SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
SELENIUM_DRIVER_ARGUMENTS = [ SELENIUM_DRIVER_ARGUMENTS = [
'--headless', '--headless',
'--no-sandbox', '--no-sandbox',

View File

@@ -1,24 +1,17 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json import json
import logging as logger import logging as logger
import random
import re import re
import time
from urllib import parse from urllib import parse
import redis import redis
import scrapy import scrapy
from scrapy_selenium import SeleniumRequest from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from MediaSpiders.items import MediaspidersItem from MediaSpiders.items import MediaspidersItem
from MediaSpiders.utils.http_utils import http_post from MediaSpiders.utils.http_utils import http_post
from MediaSpiders.utils.login_utils import login from MediaSpiders.utils.login_utils import login
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
from selenium.webdriver.common.action_chains import ActionChains
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
@@ -42,8 +35,8 @@ class TwitterSpider(scrapy.Spider):
'IMAGES_RESULT_FIELD': 'es_urlimage', 'IMAGES_RESULT_FIELD': 'es_urlimage',
'FILES_STORE': r'/usr/local/videos', 'FILES_STORE': r'/usr/local/videos',
'FILES_RESULT_FIELD': 'es_video', 'FILES_RESULT_FIELD': 'es_video',
'ZIP_FILE_NAME': 'image_data_publicinfo_', 'ZIP_FILE_NAME': 'image_data_ship_', # 图片包名称
'FILE_ZIP_FILE_NAME': 'image_data_plane_', 'FILE_ZIP_FILE_NAME': 'image_data_plane_', # 视频包名称
'ITEM_PIPELINES': { 'ITEM_PIPELINES': {
'scrapy.pipelines.images.ImagesPipeline': 2, 'scrapy.pipelines.images.ImagesPipeline': 2,
'scrapy.pipelines.files.FilesPipeline': 1, 'scrapy.pipelines.files.FilesPipeline': 1,
@@ -79,32 +72,34 @@ class TwitterSpider(scrapy.Spider):
self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'], self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
password=self.settings['REDIS_PWD']) password=self.settings['REDIS_PWD'])
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY'] self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
cookie_string = None
# 获取采集登录账号并登录 # 获取采集登录账号并登录
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts') login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
# 尝试自动化登录网页获取 cookies若失败则从redis中 使用已有cookies # 从redis中 使用已有cookies否则自动化登录网页获取cookies
try: cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
ct0 = None
driver = login().login_with_selenium( if cookie_string:
'https://x.com/i/flow/login',
self.name,
login_users=login_users,
response=response
)
cookies = driver.get_cookies()
# 取cookie中的ct0为x-csrf-token取gt为x-guest-token
self.cookie_dict = {}
for cookie in cookies:
self.cookie_dict[cookie['name']] = cookie['value']
except Exception as e:
logger.info("自动化获取cookies失败")
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
self.cookie_dict = form_cookie_dict(cookie_string) self.cookie_dict = form_cookie_dict(cookie_string)
# 5. 构建 headers # 5. 构建 headers
ct0 = self.cookie_dict.get('ct0') ct0 = self.cookie_dict.get('ct0')
if not ct0: if not ct0:
logger.error("redis中cookie缺失ct0 (CSRF token)") logger.error("redis中cookie缺失ct0 (CSRF token)")
return return
else:
try:
driver = login().login_with_selenium(
'https://x.com/i/flow/login',
self.name,
login_users=login_users,
response=response
)
cookies = driver.get_cookies()
# 取cookie中的ct0为x-csrf-token取gt为x-guest-token
self.cookie_dict = {}
for cookie in cookies:
self.cookie_dict[cookie['name']] = cookie['value']
except Exception as e:
logger.info("自动化获取cookies失败")
self.header = { self.header = {
'Host': 'api.twitter.com', 'Host': 'api.twitter.com',
@@ -137,7 +132,7 @@ class TwitterSpider(scrapy.Spider):
yield scrapy.Request(url=graphql_url, callback=self.parse, yield scrapy.Request(url=graphql_url, callback=self.parse,
meta={ meta={
'uid': user_info['userUid'], 'uid': user_info['userUid'],
'proxy': 'http://127.0.0.1:10809', # 'proxy': 'http://127.0.0.1:10808',
'currentCount': 0 'currentCount': 0
}, },
cookies=self.cookie_dict, headers=self.header) cookies=self.cookie_dict, headers=self.header)

View File

@@ -12,8 +12,9 @@ SCHEDULER_PERSIST = True
SELENIUM_DRIVER_NAME = 'firefox' SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = [ SELENIUM_DRIVER_EXECUTABLE_PATH = [
'http://10.55.13.121:28095', 'http://10.55.13.121:28095',
'http://10.55.13.108:28095', # 'http://10.55.13.108:28095',
'http://10.55.13.3:28095', 'http://10.55.13.3:28095',
'http://74.121.148.204:28095'
] ]
SELENIUM_DRIVER_ARGUMENTS = ['-headless'] # '--headless' if using chrome instead of firefox SELENIUM_DRIVER_ARGUMENTS = ['-headless'] # '--headless' if using chrome instead of firefox
SELENIUM_DRIVER_PREFERENCES = { SELENIUM_DRIVER_PREFERENCES = {
@@ -168,7 +169,7 @@ ITEM_PIPELINES = {
} }
############################## 翻译 ############################## 翻译
MAX_TEXT_LENGTH = 100 MAX_TEXT_LENGTH = 5999
# 翻译 API 地址(替换为你的服务器 IP 或域名) # 翻译 API 地址(替换为你的服务器 IP 或域名)
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate" TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
# 单次请求间隔(秒),避免 API 被限流 # 单次请求间隔(秒),避免 API 被限流

View File

@@ -137,7 +137,7 @@ def get_format_time(pattern, time_str):
if __name__ == '__main__': if __name__ == '__main__':
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日'] # a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
a = ['2026년 1월 6일 화요일 1면 [사진있음]'] a = ['Wed, 12/03/2025 - 12:00']
for _ in a: for _ in a:
# print(get_time_stamp(_)) print(get_time_stamp(_))
print(get_time_stamp(_, {r"(\d{4}\d{1,2}월 \d{1,2}일)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']})) # print(get_time_stamp(_, {r"(\d{2}.\d{2}.\d{4})\D*(\d{2}\d{2}\d{2})*\D*": ['%d-%m-%Y %H:%M:%S']}))

View File

@@ -73,3 +73,6 @@ def update_record(cursor, es_sid: int, new_title: str, new_content: str):
WHERE es_sid = % s WHERE es_sid = % s
""" """
cursor.execute(update_query, (new_title, new_content, es_sid)) cursor.execute(update_query, (new_title, new_content, es_sid))
if __name__ == "__main__":
print(translate_content_with_paragraphs("ВСУ провалили наступление на Сумском и Харьковском направлениях, сообщили РИА Новости в силовых структурах. В результате слаженных действий российских бойцов контратаки отражены, а противник обращен в бегство. Введенные ЕС ограничения на передвижения российских дипломатов противоречат Венской конвенции о дипломатических сношениях и мешают нормальной работе дипмиссий. Об этом заявил РИА Новости посол России в Бельгии Денис Гончар. Вице-президент США Джей Ди Вэнс посетит с визитом Армению и Азербайджан. Поездка в Ереван состоится 9-10 февраля, в Баку 10-11 февраля. В Вашингтон Вэнс вернется \"в среду вечером\", сообщает его пресс-пул. Либерально-демократическая партия под руководством премьер-министра Японии Санаэ Такаити победила на выборах в ключевую нижнюю палату парламента. Представители ЛДП получат 316 из 465 мандатов и смогут проводить законопроекты, даже если они не получат поддержки верхней палаты, где партия не имеет большинства. В России самая низкая безработица в странах \"Большой двадцатки\", выяснило РИА Новости, изучив данные национальных статслужб по итогам 2025 года. Уровень безработицы в России в декабре составил 2,2 процента, что на одну десятую процента ниже показателя 2024 года."))