# osc/research/BaiduTranslateDemo/baidu_translate.py
# 2025-05-28 19:16:17 +08:00
#
# 98 lines
# 3.7 KiB
# Python

import json
import logging
import re
import logging as logger
import execjs
import requests
import selenium
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options
class BaiduTranslator:
    """Translate text between Chinese and English via Baidu's web endpoints.

    A remote Selenium session loads the Baidu translate page to collect the
    cookies, ``token`` and ``gtk`` values the web endpoints expect; the
    translation requests themselves are sent with ``requests``.

    The instance owns a remote browser session. Release it with
    :meth:`end_session`, or use the instance as a context manager::

        with BaiduTranslator() as translator:
            translator.run("hello")
    """

    # Compiled once for the whole class; still reachable as ``self.pattern``
    # and ``self.pattern_gtk``.
    pattern = re.compile(
        r"window\['common'\]\W*?=\W*?{\W*?.*?token.*?:.*?'(\w+)',")
    pattern_gtk = re.compile(r'window.gtk\W*?=\W*?"(.*?)"')

    # Seconds to wait for each HTTP request; overridden per instance in
    # __init__.
    timeout = 30

    # Compiled ``gen.js`` context, created lazily by get_sign().
    _js_ctx = None

    def __init__(self, command_executor="http://39.101.194.63:28050",
                 timeout=30):
        """Open the remote browser session and collect cookies, token and gtk.

        Args:
            command_executor: URL of the remote Selenium server.
            timeout: Seconds to wait for each ``requests`` call made by
                :meth:`parse_url`.

        Raises:
            RuntimeError: If the token or gtk cannot be found in the page.
            Exception: Whatever Selenium raises while starting or driving the
                session. The session is quit before the error propagates.
        """
        self.timeout = timeout
        driver_options = Options()
        driver_options.add_argument('-headless')
        logger.info('Starting browser session...')
        # NOTE(review): Firefox options are combined with Edge capabilities
        # here -- confirm which browser the remote server should launch.
        self.browser = selenium.webdriver.remote.webdriver.WebDriver(
            command_executor=command_executor,
            desired_capabilities=DesiredCapabilities.EDGE,
            options=driver_options)
        try:
            # The page is loaded here and again inside get_token_or_gtk();
            # presumably the token embedded in the page is only usable once
            # the cookies from a first visit exist -- confirm before removing
            # either load.
            self.browser.get('https://fanyi.baidu.com/translate')
            logger.info('Browser session started')
            self.trans_str = ''
            self.trans_url = "https://fanyi.baidu.com/v2transapi"
            self.dict_cookies = self.browser.get_cookies()
            self.cookies_str = self._format_cookies(self.dict_cookies)
            # NOTE(review): cookies and token are written to the log at INFO
            # level; consider lowering the level if logs are shared.
            logger.info('Set cookies as %s', self.cookies_str)
            self.headers = {
                "user-agent":
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
                "Cookie": self.cookies_str
            }
            self.token, self.gtk = self.get_token_or_gtk()
            logger.info('Set token as %s', self.token)
            logger.info('Set gtk as %s', self.gtk)
            logger.info('Baidu translate api started success')
        except BaseException:
            # Do not leave an orphaned session on the remote server when
            # initialisation fails part-way through.
            self.browser.quit()
            raise

    def __enter__(self):
        """Return the translator itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser session when the ``with`` block exits."""
        self.end_session()

    @staticmethod
    def _format_cookies(cookies):
        """Join Selenium cookie dicts into a ``name=value; `` header string."""
        return ''.join(
            '%s=%s; ' % (item['name'], item['value']) for item in cookies)

    @staticmethod
    def _build_trans_data(query, lang):
        """Build the form fields: zh -> en when *lang* is "zh", else en -> zh."""
        source, target = ("zh", "en") if lang == "zh" else ("en", "zh")
        return {"query": query, "from": source, "to": target}

    @staticmethod
    def _join_result(dict_response):
        """Concatenate every ``dst`` segment of a translation response."""
        return ''.join(
            segment['dst'] for segment in dict_response['trans_result']['data'])

    def _extract_token_and_gtk(self, page_str):
        """Pull the token and gtk out of the page HTML.

        Raises:
            RuntimeError: If either value is absent from *page_str*.
        """
        token_match = self.pattern.search(page_str)
        gtk_match = self.pattern_gtk.search(page_str)
        missing = [
            name
            for name, match in (('token', token_match), ('gtk', gtk_match))
            if match is None
        ]
        if missing:
            raise RuntimeError(
                'Could not find %s in the Baidu translate page'
                % ' and '.join(missing))
        return token_match.group(1), gtk_match.group(1)

    def parse_url(self, data, url="https://fanyi.baidu.com/langdetect"):
        """POST *data* as form fields to *url* and return the decoded JSON.

        Args:
            data: Form fields to send.
            url: Endpoint to call; defaults to the language-detection one.

        Returns:
            The parsed JSON body.

        Raises:
            requests.RequestException: On network failure or timeout.
            ValueError: If the body is not valid JSON.
        """
        response = requests.post(
            url, data=data, headers=self.headers, timeout=self.timeout)
        return json.loads(response.content.decode())

    def get_token_or_gtk(self, url="https://fanyi.baidu.com/translate"):
        """Load *url* in the browser and return ``(token, gtk)`` from its HTML.

        Raises:
            RuntimeError: If the token or gtk is not present in the page.
        """
        self.browser.get(url)
        # find_element("xpath", ...) is the same lookup the older
        # find_element_by_xpath helper performed, and it still exists in
        # Selenium releases that dropped the helper.
        root = self.browser.find_element("xpath", "//*")
        return self._extract_token_and_gtk(root.get_attribute("outerHTML"))

    def get_sign(self, gtk):
        """Return the request signature for ``self.trans_str``.

        The signature is produced by the ``e`` function in ``./gen.js``
        (resolved against the current working directory). The script is read
        and compiled on first use only.

        Args:
            gtk: The gtk value taken from the translate page.
        """
        if self._js_ctx is None:
            with open("./gen.js", 'r') as f:
                self._js_ctx = execjs.compile(f.read())
        return self._js_ctx.call("e", self.trans_str, gtk)

    def run(self, trans_str):
        """Translate *trans_str*, print the result and return it.

        The source language is detected first: text detected as "zh" is
        translated to English, anything else is treated as English and
        translated to Chinese.

        Args:
            trans_str: Text to translate.

        Returns:
            The translated sentence, or ``None`` when the response lacks the
            expected ``trans_result`` structure (the raw response is printed
            in that case).
        """
        self.trans_str = trans_str
        try:
            lang = self.parse_url({"query": self.trans_str})["lan"]
        except KeyError:
            # Detection response carried no "lan" field; fall back to English.
            lang = 'en'
        trans_data = self._build_trans_data(self.trans_str, lang)
        trans_data.update({
            "sign": self.get_sign(self.gtk),
            "token": self.token,
            "transtype": "translang",
            "simple_means_flag": 3
        })
        dict_response = self.parse_url(trans_data, self.trans_url)
        try:
            result_sentence = self._join_result(dict_response)
        except (LookupError, TypeError) as e:
            # Typically an error payload from the API; show it verbatim.
            print('[ERROR {}] {}'.format(
                repr(e), json.dumps(dict_response, ensure_ascii=False)))
            return None
        print(result_sentence)
        return result_sentence

    def end_session(self):
        """Quit the remote browser session."""
        self.browser.quit()