Google Batch Translation Implemented in Python
First, a disclaimer: there is no shady motive here. I use translate.google.cn a lot, so I wanted to simulate the web page's form submission in Python to batch-translate documents. There is said to be an official API, but it is a paid service.
Generating the token
The code Google uses to generate its anti-crawler token is JavaScript, and the token is computed dynamically from the site's TKK value and the submitted text. How often the TKK changes is unknown, so the only option is to re-fetch it periodically.
Most of the Python code that can be found online calls the PyExecJS library. Leaving aside execution efficiency (roughly an order of magnitude slower), it is a roundabout, impure approach that I personally dislike.
The one pure-Python version I eventually found still had a small bug and was missing the step that fetches the TKK dynamically, so in the end I ported it myself against the JavaScript. The method is simple: first rewrite the obfuscated JavaScript into a readable form, then translate that into Python. The JavaScript code comes from a C# implementation of the Google Translate API.
The original (obfuscated) JavaScript code
var b = function (a, b) {
    for (var d = 0; d < b.length - 2; d += 3) {
        var c = b.charAt(d + 2), c = "a" <= c ? c.charCodeAt(0) - 87 : Number(c), c = "+" == b.charAt(d + 1) ? a >>> c : a << c;
        a = "+" == b.charAt(d) ? a + c & 4294967295 : a ^ c
    }
    return a
};
var tk = function (a, TKK) {
    for (var e = TKK.split("."), h = Number(e[0]) || 0, g = [], d = 0, f = 0; f < a.length; f++) {
        var c = a.charCodeAt(f);
        128 > c ? g[d++] = c : (2048 > c ? g[d++] = c >> 6 | 192 : (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512) ? (c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023), g[d++] = c >> 18 | 240, g[d++] = c >> 12 & 63 | 128) : g[d++] = c >> 12 | 224, g[d++] = c >> 6 & 63 | 128), g[d++] = c & 63 | 128)
    }
    a = h;
    for (d = 0; d < g.length; d++) a += g[d], a = b(a, "+-a^+6");
    a = b(a, "+-3^+b+-f");
    a ^= Number(e[1]) || 0;
    0 > a && (a = (a & 2147483647) + 2147483648);
    a %= 1E6;
    return a.toString() + "." + (a ^ h)
};
The readable JavaScript code
function RL(a, b) {
    for (var d = 0; d < b.length - 2; d += 3) {
        var c = b.charAt(d + 2);
        c = 'a' <= c ? c.charCodeAt(0) - 87 : Number(c);
        c = '+' == b.charAt(d + 1) ? a >>> c : a << c;
        a = '+' == b.charAt(d) ? a + c & 4294967295 : a ^ c;
    }
    return a;
}
function TL(a, TKK) {
    var e = TKK.split('.');
    var h = Number(e[0]) || 0;
    var g = [];
    var d = 0;
    for (var f = 0; f < a.length; f++) {
        var c = a.charCodeAt(f);
        if (128 > c)
        {
            g[d++] = c;
        }
        else
        {
            if (2048 > c)
            {
                g[d++] = c >> 6 | 192;
            }
            else
            {
                if (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512))
                {
                    c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023);
                    g[d++] = c >> 18 | 240;
                    g[d++] = c >> 12 & 63 | 128;
                }
                else
                {
                    g[d++] = c >> 12 | 224;
                }
                g[d++] = c >> 6 & 63 | 128;
            }
            g[d++] = c & 63 | 128;
        }
    }
    a = h;
    for (d = 0; d < g.length; d++)
    {
        a += g[d];
        a = RL(a, '+-a^+6');
    }
    a = RL(a, '+-3^+b+-f');
    a ^= Number(e[1]) || 0;
    0 > a && (a = (a & 2147483647) + 2147483648);
    a %= 1E6;
    return a.toString() + '.' + (a ^ h);
}
The Python code
def getGoogleToken(a, TKK):
    def RL(a, b):
        # bit-mixing helper, the b() / RL() function of the JavaScript version
        for d in range(0, len(b) - 2, 3):
            c = b[d + 2]
            c = ord(c[0]) - 87 if 'a' <= c else int(c)
            c = a >> c if '+' == b[d + 1] else a << c
            a = a + c & 4294967295 if '+' == b[d] else a ^ c
        return a

    e = TKK.split('.')
    h = int(e[0]) or 0
    g = []
    f = 0
    # encode the text as UTF-8 byte values (surrogate pairs handled explicitly)
    while f < len(a):
        c = ord(a[f])
        if 128 > c:
            g.append(c)
        else:
            if 2048 > c:
                g.append((c >> 6) | 192)
            else:
                if (55296 == (c & 64512)) and (f + 1 < len(a)) and (56320 == (ord(a[f + 1]) & 64512)):
                    f += 1
                    c = 65536 + ((c & 1023) << 10) + (ord(a[f]) & 1023)
                    g.append((c >> 18) | 240)
                    g.append((c >> 12) & 63 | 128)
                else:
                    g.append((c >> 12) | 224)
                g.append((c >> 6) & 63 | 128)
            g.append((c & 63) | 128)
        f += 1
    t = h
    for item in g:
        t += item
        t = RL(t, '+-a^+6')
    t = RL(t, '+-3^+b+-f')
    t ^= int(e[1]) or 0
    if 0 > t:
        t = (t & 2147483647) + 2147483648
    result = t % 1000000
    return str(result) + '.' + str(result ^ h)
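As a quick sanity check, the function can be called directly with a captured TKK. The TKK below is only a placeholder copied from the example comment in the full listing further down; a real value has to be scraped from translate.google.cn first:

# Placeholder TKK taken from the example comment in the class further down;
# in practice the TKK is scraped from translate.google.cn at run time.
tkk = '435102.3120524463'
print(getGoogleToken('Tablet Developer', tkk))  # prints a token of the form 'NNNNNN.NNNNNN'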
Getting the token key
Google's TKK can be obtained by requesting https://translate.google.cn; one of the scripts on that page contains a fragment like tkk:'xxxxxx.xxxxxx', which can be captured with a regular expression.
res = requests.get('https://translate.google.cn', timeout=3)
res.raise_for_status()
result = re.search(r'tkk\:\'(\d+\.\d+)?\'', res.text).group(1)
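Note that re.search returns None if Google changes the page and the pattern no longer matches, so it is safer to guard the lookup; a minimal sketch:

import re
import requests

res = requests.get('https://translate.google.cn', timeout=3)
res.raise_for_status()
match = re.search(r'tkk\:\'(\d+\.\d+)?\'', res.text)
tkk = match.group(1) if match else ''  # an empty string signals "no TKK found"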
Splitting the text into paragraphs
Because I often copy text out of PDFs for translation, single line breaks cannot be relied on to delimit paragraphs; only a blank line is treated as a paragraph boundary.
Also, in the JSON that Google returns, the translation is split at English sentence boundaries (periods), and each translated sentence comes back as a separate array element, so at the end the pieces have to be concatenated back into one paragraph.
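To illustrate the merge, the sketch below uses an assumed, simplified shape of the returned JSON (only the parts the code actually reads: jsonResult[0] is a list of per-sentence entries whose first field is the translated text):

# Assumed, simplified response shape; only the fields used here are shown.
jsonResult = [
    [
        ["第一句的译文。", "The first sentence.", None, None, 1],
        ["第二句的译文。", "The second sentence.", None, None, 1],
    ],
    # ... further blocks (dictionary entries, detected language, ...) are ignored
]

# concatenate the per-sentence translations back into one paragraph
paragraph = ''.join(item[0] for item in jsonResult[0] if item[0])
print(paragraph)  # 第一句的译文。第二句的译文。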
The complete code
The code is not long; the full listing is pasted below.
GoogleTranslator.py:
import requests
import re
import json
import time


class GoogleTranslator():
    _host = 'translate.google.cn'
    _headers = {
        'Host': _host,
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
        'Referer': 'https://' + _host,
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0'
    }
    _language = {
        'afrikaans': 'af',
        'arabic': 'ar',
        'belarusian': 'be',
        'bulgarian': 'bg',
        'catalan': 'ca',
        'czech': 'cs',
        'welsh': 'cy',
        'danish': 'da',
        'german': 'de',
        'greek': 'el',
        'english': 'en',
        'esperanto': 'eo',
        'spanish': 'es',
        'estonian': 'et',
        'persian': 'fa',
        'finnish': 'fi',
        'french': 'fr',
        'irish': 'ga',
        'galician': 'gl',
        'hindi': 'hi',
        'croatian': 'hr',
        'hungarian': 'hu',
        'indonesian': 'id',
        'icelandic': 'is',
        'italian': 'it',
        'hebrew': 'iw',
        'japanese': 'ja',
        'korean': 'ko',
        'latin': 'la',
        'lithuanian': 'lt',
        'latvian': 'lv',
        'macedonian': 'mk',
        'malay': 'ms',
        'maltese': 'mt',
        'dutch': 'nl',
        'norwegian': 'no',
        'polish': 'pl',
        'portuguese': 'pt',
        'romanian': 'ro',
        'russian': 'ru',
        'slovak': 'sk',
        'slovenian': 'sl',
        'albanian': 'sq',
        'serbian': 'sr',
        'swedish': 'sv',
        'swahili': 'sw',
        'thai': 'th',
        'filipino': 'tl',
        'turkish': 'tr',
        'ukrainian': 'uk',
        'vietnamese': 'vi',
        'yiddish': 'yi',
        'chinese_simplified': 'zh-CN',
        'chinese_traditional': 'zh-TW',
        'auto': 'auto'
    }
    _url = 'https://' + _host + '/translate_a/single'
    _params = {
        'client': 'webapp',
        'sl': 'en',
        'tl': 'zh-CN',
        'hl': 'zh-CN',
        # a dict literal cannot hold duplicate 'dt' keys; a list value makes
        # requests send one repeated dt parameter per item
        'dt': ['at', 'bd', 'ex', 'ld', 'md', 'qca', 'rw', 'rm', 'ss', 't'],
        'otf': '1',
        'ssel': '0',
        'tsel': '0',
        'kc': '1'
    }
    __cookies = None
    __googleTokenKey = '376032.257956'
    __googleTokenKeyUpdataTime = 600.0
    __googleTokenKeyRetireTime = time.time() + 600.0
    def __init__(self, src='en', dest='zh-CN', tkkUpdataTime=600.0):
        if src not in self._language and src not in self._language.values():
            src = 'auto'
        if dest not in self._language and dest not in self._language.values():
            dest = 'auto'
        self._params['sl'] = src
        self._params['tl'] = dest
        self.__googleTokenKeyUpdataTime = tkkUpdataTime
        self.__updateGoogleTokenKey()

    def __updateGoogleTokenKey(self):
        key = self.__getGoogleTokenKey()
        if key:  # keep the previous TKK if the fetch failed
            self.__googleTokenKey = key
        self.__googleTokenKeyRetireTime = time.time() + self.__googleTokenKeyUpdataTime
    def __getGoogleTokenKey(self):
        """Get the Google TKK from https://translate.google.cn"""
        # TKK example: '435075.3634891900'
        result = ''
        try:
            res = requests.get('https://' + self._host, timeout=3)
            res.raise_for_status()
            self.__cookies = res.cookies
            match = re.search(r'tkk\:\'(\d+\.\d+)?\'', res.text)
            if match:  # the page layout may change and the pattern may stop matching
                result = match.group(1)
        except requests.exceptions.ReadTimeout as ex:
            print('ERROR: ' + str(ex))
            time.sleep(1)
        return result
    def __getGoogleToken(self, a, TKK):
        """Calculate the Google tk value from the text and the TKK"""
        # https://www.cnblogs.com/chicsky/p/7443830.html
        # if text = 'Tablet Developer' and TKK = '435102.3120524463', then tk = '315066.159012'
        def RL(a, b):
            # bit-mixing helper, the b() / RL() function of the JavaScript version
            for d in range(0, len(b) - 2, 3):
                c = b[d + 2]
                c = ord(c[0]) - 87 if 'a' <= c else int(c)
                c = a >> c if '+' == b[d + 1] else a << c
                a = a + c & 4294967295 if '+' == b[d] else a ^ c
            return a

        e = TKK.split('.')
        h = int(e[0]) or 0
        g = []
        f = 0
        # encode the text as UTF-8 byte values (surrogate pairs handled explicitly)
        while f < len(a):
            c = ord(a[f])
            if 128 > c:
                g.append(c)
            else:
                if 2048 > c:
                    g.append((c >> 6) | 192)
                else:
                    if (55296 == (c & 64512)) and (f + 1 < len(a)) and (56320 == (ord(a[f + 1]) & 64512)):
                        f += 1
                        c = 65536 + ((c & 1023) << 10) + (ord(a[f]) & 1023)
                        g.append((c >> 18) | 240)
                        g.append((c >> 12) & 63 | 128)
                    else:
                        g.append((c >> 12) | 224)
                    g.append((c >> 6) & 63 | 128)
                g.append((c & 63) | 128)
            f += 1
        t = h
        for item in g:
            t += item
            t = RL(t, '+-a^+6')
        t = RL(t, '+-3^+b+-f')
        t ^= int(e[1]) or 0
        if 0 > t:
            t = (t & 2147483647) + 2147483648
        result = t % 1000000
        return str(result) + '.' + str(result ^ h)
    def translate(self, text):
        if time.time() > self.__googleTokenKeyRetireTime:
            self.__updateGoogleTokenKey()
        data = {'q': text}
        self._params['tk'] = self.__getGoogleToken(text, self.__googleTokenKey)
        result = ''
        try:
            res = requests.post(self._url,
                                headers=self._headers,
                                cookies=self.__cookies,
                                data=data,
                                params=self._params,
                                timeout=6)
            res.raise_for_status()
            jsonText = res.text
            if len(jsonText) > 0:
                jsonResult = json.loads(jsonText)
                if len(jsonResult[0]) > 0:
                    for item in jsonResult[0]:
                        # item[0] is the translated sentence; skip entries
                        # (e.g. transliteration) that carry no translated text
                        if item[0]:
                            result += item[0]
            return result
        except Exception as ex:
            print('ERROR: ' + str(ex))
            return ''

main.py:

import time
from GoogleTranslator import GoogleTranslator


def readFile(fileName):
    with open(fileName, 'r') as f:
        paragraph = ''
        for line in f:
            if line[0] != '\n':
                paragraph += line.strip('\n')
            else:
                if len(paragraph) > 0:
                    yield paragraph
                    paragraph = ''
        if len(paragraph) > 0:
            yield paragraph
def main():
    translator = GoogleTranslator()
    count = 0
    with open('C:\\dx\\python\\d.txt', 'w', encoding='utf-8') as df:
        for line in readFile('C:\\dx\\python\\s.txt'):
            if len(line) > 1:
                count += 1
                print('\r' + str(count), end='', flush=True)
                df.write(line.strip() + "\n")
                result = translator.translate(line)
                df.write(result.strip() + "\n\n")


if __name__ == "__main__":
    startTime = time.time()
    main()
    print()
    print('%.2f seconds' % (time.time() - startTime))
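For a quick interactive test, the GoogleTranslator class can also be used on its own, without the batch script (a minimal sketch; language codes not found in the _language table fall back to 'auto'):

from GoogleTranslator import GoogleTranslator

translator = GoogleTranslator(src='en', dest='zh-CN')
print(translator.translate('Hello, world. This is only a test.'))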
Summary
That is the Python implementation of Google batch translation described above. I hope it is helpful; if you have any questions, please leave me a comment and I will reply as soon as possible.