Compare commits
No commits in common. "cfe6c3af8541f0e653b36b509fcbf992de9f8afb" and "d3a46db615c7080609be3c809cc1f4f7f1cac430" have entirely different histories.
cfe6c3af85
...
d3a46db615
@ -21,11 +21,14 @@ DB_CONFIG = {
|
|||||||
TRANSLATE_API_URL = "http://47.113.231.200:28081/translate"
|
TRANSLATE_API_URL = "http://47.113.231.200:28081/translate"
|
||||||
|
|
||||||
# 指定时间(格式:YYYY-MM-DD HH:MM:SS)
|
# 指定时间(格式:YYYY-MM-DD HH:MM:SS)
|
||||||
LOADTIME_AFTER = "2026-02-10 11:59:00"
|
LOADTIME_AFTER = "2026-01-16 10:40:00"
|
||||||
|
|
||||||
# 目标站点列表
|
# 目标站点列表
|
||||||
TARGET_SRCNAMES = [
|
TARGET_SRCNAMES = [
|
||||||
'https://www.38north.org/' # 添加你的站点
|
'http://www.rodong.rep.kp/ko/index.php?MUBAMUAxQA==',
|
||||||
|
'http://www.kcna.kp/kp/category/articles/q/5394b80bdae203fadef02522cfb578c0.kcmsf',
|
||||||
|
'https://energynow.com/category/press_releases/',
|
||||||
|
'https://www.fao.org/newsroom/en' # 添加你的站点
|
||||||
]
|
]
|
||||||
|
|
||||||
# 单次请求间隔(秒),避免 API 被限流
|
# 单次请求间隔(秒),避免 API 被限流
|
||||||
@ -101,7 +104,7 @@ def translate_content_with_paragraphs(content: str) -> str:
|
|||||||
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
|
def update_record(cursor, es_sid: int, new_title: str, new_content: str):
|
||||||
update_query = """
|
update_query = """
|
||||||
UPDATE indeximos
|
UPDATE indeximos
|
||||||
SET es_abstract = % s, es_content = % s
|
SET es_title = % s, es_content = % s
|
||||||
WHERE es_sid = % s
|
WHERE es_sid = % s
|
||||||
"""
|
"""
|
||||||
cursor.execute(update_query, (new_title, new_content, es_sid))
|
cursor.execute(update_query, (new_title, new_content, es_sid))
|
||||||
@ -119,8 +122,8 @@ def main():
|
|||||||
SELECT es_sid, es_urltitle, es_urlcontent
|
SELECT es_sid, es_urltitle, es_urlcontent
|
||||||
FROM indeximos
|
FROM indeximos
|
||||||
WHERE es_loadtime > %s
|
WHERE es_loadtime > %s
|
||||||
AND (es_content IS NULL OR TRIM(es_content) = '')
|
AND (es_title IS NULL OR TRIM(es_title) = '')
|
||||||
-- AND es_srcname IN ({placeholders})
|
AND es_srcname IN ({placeholders})
|
||||||
AND LENGTH(es_video) > 5
|
AND LENGTH(es_video) > 5
|
||||||
"""
|
"""
|
||||||
params = [LOADTIME_AFTER] + TARGET_SRCNAMES
|
params = [LOADTIME_AFTER] + TARGET_SRCNAMES
|
||||||
|
|||||||
@ -76,7 +76,7 @@ class SeleniumMiddleware:
|
|||||||
}
|
}
|
||||||
edge_options.add_experimental_option("prefs", prefs)
|
edge_options.add_experimental_option("prefs", prefs)
|
||||||
|
|
||||||
self.driver = Edge(executable_path=r"C:\Program Files\Python38\msedgedriver.exe", options=edge_options)
|
self.driver = Edge(executable_path="C:/Users/DELL/Downloads/edgedriver_win64/msedgedriver.exe", options=edge_options)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_crawler(cls, crawler):
|
def from_crawler(cls, crawler):
|
||||||
|
|||||||
@ -110,8 +110,8 @@ CUSTOM_USER_AGENT = [
|
|||||||
|
|
||||||
# 部署在外网采集fb时使用selenium_chrome
|
# 部署在外网采集fb时使用selenium_chrome
|
||||||
SELENIUM_DRIVER_NAME = 'chrome'
|
SELENIUM_DRIVER_NAME = 'chrome'
|
||||||
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
|
SELENIUM_DRIVER_EXECUTABLE_PATH = 'local'
|
||||||
SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
|
# SELENIUM_DRIVER_EXECUTABLE_PATH = 'http://144.34.185.108:28098'
|
||||||
SELENIUM_DRIVER_ARGUMENTS = [
|
SELENIUM_DRIVER_ARGUMENTS = [
|
||||||
'--headless',
|
'--headless',
|
||||||
'--no-sandbox',
|
'--no-sandbox',
|
||||||
|
|||||||
@ -1,17 +1,24 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import json
|
import json
|
||||||
import logging as logger
|
import logging as logger
|
||||||
|
import random
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
from urllib import parse
|
from urllib import parse
|
||||||
|
|
||||||
import redis
|
import redis
|
||||||
import scrapy
|
import scrapy
|
||||||
from scrapy_selenium import SeleniumRequest
|
from scrapy_selenium import SeleniumRequest
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
from MediaSpiders.items import MediaspidersItem
|
from MediaSpiders.items import MediaspidersItem
|
||||||
from MediaSpiders.utils.http_utils import http_post
|
from MediaSpiders.utils.http_utils import http_post
|
||||||
from MediaSpiders.utils.login_utils import login
|
from MediaSpiders.utils.login_utils import login
|
||||||
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
from MediaSpiders.utils.time_utils import get_time_stamp, get_current_timestamp
|
||||||
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
|
||||||
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
|
from MediaSpiders.utils.traslate_utils import translate_single, translate_content_with_paragraphs, needs_translation
|
||||||
|
|
||||||
|
|
||||||
@ -35,8 +42,8 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
'IMAGES_RESULT_FIELD': 'es_urlimage',
|
'IMAGES_RESULT_FIELD': 'es_urlimage',
|
||||||
'FILES_STORE': r'/usr/local/videos',
|
'FILES_STORE': r'/usr/local/videos',
|
||||||
'FILES_RESULT_FIELD': 'es_video',
|
'FILES_RESULT_FIELD': 'es_video',
|
||||||
'ZIP_FILE_NAME': 'image_data_ship_', # 图片包名称
|
'ZIP_FILE_NAME': 'image_data_publicinfo_',
|
||||||
'FILE_ZIP_FILE_NAME': 'image_data_plane_', # 视频包名称
|
'FILE_ZIP_FILE_NAME': 'image_data_plane_',
|
||||||
'ITEM_PIPELINES': {
|
'ITEM_PIPELINES': {
|
||||||
'scrapy.pipelines.images.ImagesPipeline': 2,
|
'scrapy.pipelines.images.ImagesPipeline': 2,
|
||||||
'scrapy.pipelines.files.FilesPipeline': 1,
|
'scrapy.pipelines.files.FilesPipeline': 1,
|
||||||
@ -72,34 +79,32 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
|
self.redis_client = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT'],
|
||||||
password=self.settings['REDIS_PWD'])
|
password=self.settings['REDIS_PWD'])
|
||||||
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
|
self.simhash_filter_key = self.settings['TWITTER_SIMHASH_FILTER_KEY']
|
||||||
|
cookie_string = None
|
||||||
# 获取采集登录账号并登录
|
# 获取采集登录账号并登录
|
||||||
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
|
login_users = self.redis_client.smembers('MediaSpiders:Twitter_login_accounts')
|
||||||
# 从redis中 使用已有cookies,否则自动化登录网页获取cookies
|
# 尝试自动化登录网页获取 cookies,若失败则从redis中 使用已有cookies
|
||||||
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
|
try:
|
||||||
ct0 = None
|
|
||||||
if cookie_string:
|
driver = login().login_with_selenium(
|
||||||
|
'https://x.com/i/flow/login',
|
||||||
|
self.name,
|
||||||
|
login_users=login_users,
|
||||||
|
response=response
|
||||||
|
)
|
||||||
|
cookies = driver.get_cookies()
|
||||||
|
# 取cookie中的ct0为x-csrf-token,取gt为x-guest-token
|
||||||
|
self.cookie_dict = {}
|
||||||
|
for cookie in cookies:
|
||||||
|
self.cookie_dict[cookie['name']] = cookie['value']
|
||||||
|
except Exception as e:
|
||||||
|
logger.info("自动化获取cookies失败")
|
||||||
|
cookie_string = self.redis_client.get("MediaSpiders:Twitter_Cookies").decode()
|
||||||
self.cookie_dict = form_cookie_dict(cookie_string)
|
self.cookie_dict = form_cookie_dict(cookie_string)
|
||||||
# 5. 构建 headers
|
# 5. 构建 headers
|
||||||
ct0 = self.cookie_dict.get('ct0')
|
ct0 = self.cookie_dict.get('ct0')
|
||||||
if not ct0:
|
if not ct0:
|
||||||
logger.error("redis中cookie缺失ct0 (CSRF token)!")
|
logger.error("redis中cookie缺失ct0 (CSRF token)!")
|
||||||
return
|
return
|
||||||
else:
|
|
||||||
try:
|
|
||||||
|
|
||||||
driver = login().login_with_selenium(
|
|
||||||
'https://x.com/i/flow/login',
|
|
||||||
self.name,
|
|
||||||
login_users=login_users,
|
|
||||||
response=response
|
|
||||||
)
|
|
||||||
cookies = driver.get_cookies()
|
|
||||||
# 取cookie中的ct0为x-csrf-token,取gt为x-guest-token
|
|
||||||
self.cookie_dict = {}
|
|
||||||
for cookie in cookies:
|
|
||||||
self.cookie_dict[cookie['name']] = cookie['value']
|
|
||||||
except Exception as e:
|
|
||||||
logger.info("自动化获取cookies失败")
|
|
||||||
|
|
||||||
self.header = {
|
self.header = {
|
||||||
'Host': 'api.twitter.com',
|
'Host': 'api.twitter.com',
|
||||||
@ -132,7 +137,7 @@ class TwitterSpider(scrapy.Spider):
|
|||||||
yield scrapy.Request(url=graphql_url, callback=self.parse,
|
yield scrapy.Request(url=graphql_url, callback=self.parse,
|
||||||
meta={
|
meta={
|
||||||
'uid': user_info['userUid'],
|
'uid': user_info['userUid'],
|
||||||
# 'proxy': 'http://127.0.0.1:10808',
|
'proxy': 'http://127.0.0.1:10809',
|
||||||
'currentCount': 0
|
'currentCount': 0
|
||||||
},
|
},
|
||||||
cookies=self.cookie_dict, headers=self.header)
|
cookies=self.cookie_dict, headers=self.header)
|
||||||
|
|||||||
@ -12,9 +12,8 @@ SCHEDULER_PERSIST = True
|
|||||||
SELENIUM_DRIVER_NAME = 'firefox'
|
SELENIUM_DRIVER_NAME = 'firefox'
|
||||||
SELENIUM_DRIVER_EXECUTABLE_PATH = [
|
SELENIUM_DRIVER_EXECUTABLE_PATH = [
|
||||||
'http://10.55.13.121:28095',
|
'http://10.55.13.121:28095',
|
||||||
# 'http://10.55.13.108:28095',
|
'http://10.55.13.108:28095',
|
||||||
'http://10.55.13.3:28095',
|
'http://10.55.13.3:28095',
|
||||||
'http://74.121.148.204:28095'
|
|
||||||
]
|
]
|
||||||
SELENIUM_DRIVER_ARGUMENTS = ['-headless'] # '--headless' if using chrome instead of firefox
|
SELENIUM_DRIVER_ARGUMENTS = ['-headless'] # '--headless' if using chrome instead of firefox
|
||||||
SELENIUM_DRIVER_PREFERENCES = {
|
SELENIUM_DRIVER_PREFERENCES = {
|
||||||
@ -169,7 +168,7 @@ ITEM_PIPELINES = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
############################## 翻译
|
############################## 翻译
|
||||||
MAX_TEXT_LENGTH = 5999
|
MAX_TEXT_LENGTH = 100
|
||||||
# 翻译 API 地址(替换为你的服务器 IP 或域名)
|
# 翻译 API 地址(替换为你的服务器 IP 或域名)
|
||||||
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
|
TRANSLATE_API_URL = "http://47.113.231.200:28082/translate"
|
||||||
# 单次请求间隔(秒),避免 API 被限流
|
# 单次请求间隔(秒),避免 API 被限流
|
||||||
|
|||||||
@ -137,7 +137,7 @@ def get_format_time(pattern, time_str):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
|
# a = [' 令和4年6月9日', 'www.kcna.kp (主体111.6.6.)', '民國111年06月09日 ', 'Jun. 9, 2022', '111年 06月 21日']
|
||||||
a = ['Wed, 12/03/2025 - 12:00']
|
a = ['2026년 1월 6일 화요일 1면 [사진있음]']
|
||||||
for _ in a:
|
for _ in a:
|
||||||
print(get_time_stamp(_))
|
# print(get_time_stamp(_))
|
||||||
# print(get_time_stamp(_, {r"(\d{2}.\d{2}.\d{4})\D*(\d{2}\d{2}\d{2})*\D*": ['%d-%m-%Y %H:%M:%S']}))
|
print(get_time_stamp(_, {r"(\d{4}년 \d{1,2}월 \d{1,2}일)\D*(\d{2}:\d{2}:\d{2})*\D*": ['%Y-%m-%d %H:%M:%S']}))
|
||||||
|
|||||||
@ -73,6 +73,3 @@ def update_record(cursor, es_sid: int, new_title: str, new_content: str):
|
|||||||
WHERE es_sid = % s
|
WHERE es_sid = % s
|
||||||
"""
|
"""
|
||||||
cursor.execute(update_query, (new_title, new_content, es_sid))
|
cursor.execute(update_query, (new_title, new_content, es_sid))
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
print(translate_content_with_paragraphs("ВСУ провалили наступление на Сумском и Харьковском направлениях, сообщили РИА Новости в силовых структурах. В результате слаженных действий российских бойцов контратаки отражены, а противник обращен в бегство. Введенные ЕС ограничения на передвижения российских дипломатов противоречат Венской конвенции о дипломатических сношениях и мешают нормальной работе дипмиссий. Об этом заявил РИА Новости посол России в Бельгии Денис Гончар. Вице-президент США Джей Ди Вэнс посетит с визитом Армению и Азербайджан. Поездка в Ереван состоится 9-10 февраля, в Баку – 10-11 февраля. В Вашингтон Вэнс вернется \"в среду вечером\", сообщает его пресс-пул. Либерально-демократическая партия под руководством премьер-министра Японии Санаэ Такаити победила на выборах в ключевую нижнюю палату парламента. Представители ЛДП получат 316 из 465 мандатов и смогут проводить законопроекты, даже если они не получат поддержки верхней палаты, где партия не имеет большинства. В России самая низкая безработица в странах \"Большой двадцатки\", выяснило РИА Новости, изучив данные национальных статслужб по итогам 2025 года. Уровень безработицы в России в декабре составил 2,2 процента, что на одну десятую процента ниже показателя 2024 года."))
|
|
||||||
Loading…
x
Reference in New Issue
Block a user