nodejs简单实现中英文翻译

2024-04-01 06:58:03 317

帮以前同事解决一个需求，中文项目翻译英文项目~~~

考虑到具体实现方面的问题，如果智能的话，肯定是要做中文的语法分析，不过感觉这个有难度。

所以最后的方案是遍历文件，将中文短语匹配出来，再进行人工翻译，将中文短语替换成翻译的内容。当然后期还是需要人工再检验下，毕竟代码中的中文，可能会影响到相关的程序。

这个问题明显涉及并发和文件读写，第一时间想到的就是nodejs。虽然nodejs只有一个主线程，但异步文件读写和事件响应机制在底层肯定也用到了线程，只是实际编程的时候不需要考虑线程相关的问题。

代码并不复杂，如下所示；写完之后做了适当的封装：

var fs = require('fs');
var http = require('http');
var path = require('path');
// Root directory that is scanned recursively for source files.
var filePath = 'D:\\WORK_new\\';
// File the phrase dictionary is saved to / loaded from (JSON text).
var logPath = 'D:\\chinese.log';

// NOTE(review): `map` and `num` are not referenced by any code visible in this
// file (the dictionary keeps its own private `map`); kept in case other code
// relies on them - confirm before removing.
var map = {};
var num = 0;

/**
 * Phrase dictionary: maps a source phrase to its translation.
 *
 * An entry whose value is the empty string means "not translated yet".
 * The dictionary can be persisted to / restored from `logPath` as JSON.
 */
var dictionary = (function () {
    // phrase -> translation ('' while untranslated)
    var map = {};

    var hasOwn = function (obj, key) {
        return Object.prototype.hasOwnProperty.call(obj, key);
    };

    return {
        // Path of the JSON file used by save2File() and loadFile().
        logPath: 'D:\\chinese.log',

        /**
         * Stores `val` as the translation of `key`.
         * A missing or falsy `val` is stored as '' (untranslated).
         */
        set: function (key, val) {
            map[key] = val || '';
        },

        /**
         * Returns the stored translation of `key`, or '' when there is none.
         * Only own entries are considered, so a key such as "constructor"
         * does not leak a value inherited from Object.prototype.
         */
        get: function (key) {
            return hasOwn(map, key) ? (map[key] || '') : '';
        },

        /**
         * Writes the dictionary to `this.logPath` as UTF-8 JSON, one entry per
         * line (a CRLF is inserted after each `","` separator; the result is
         * still valid JSON). Fire-and-forget: a write error is thrown from the
         * I/O callback.
         */
        save2File: function () {
            var text = JSON.stringify(map).replace(/","/g, '",\r\n"');
            fs.writeFile(this.logPath, text, { encoding: 'utf8', flag: 'w' }, function (err) {
                if (err) throw err;
            });
        },

        /**
         * Replaces the in-memory dictionary with the JSON content of
         * `this.logPath`, then calls `callback()`.
         * A read error is thrown (same policy as save2File) instead of being
         * ignored and surfacing later as a confusing JSON.parse failure.
         */
        loadFile: function (callback) {
            fs.readFile(this.logPath, { encoding: 'utf8' }, function (err, data) {
                if (err) throw err;
                map = JSON.parse(data);
                callback();
            });
        },

        /**
         * Requests a translation for every untranslated entry and calls
         * `callback()` exactly once after all requests have finished
         * (successfully or not). Entries whose request or response fails are
         * left untranslated. If nothing needs translating, `callback` is still
         * invoked (asynchronously).
         */
        translateByGoogle: function (callback) {
            var baseUrl = "http://translate.google.cn/translate_a/t?client=t&hl=zh-CN&sl=zh-CN&tl=en&ie=UTF-8&oe=UTF-8&oc=2&otf=1&ssel=3&tsel=6&sc=2&q=";

            var todo = [];
            for (var key in map) {
                if (hasOwn(map, key) && map[key] === '') {
                    todo.push(key);
                }
            }

            if (todo.length === 0) {
                // Without this the caller would wait forever.
                process.nextTick(callback);
                return;
            }

            var pending = todo.length;

            todo.forEach(function (phrase) {
                // Guards against counting one request twice (e.g. an 'error'
                // emitted after the response has already ended).
                var settled = false;
                var finish = function () {
                    if (settled) return;
                    settled = true;
                    pending--;
                    if (pending === 0) {
                        callback();
                    }
                };

                // The phrase is non-ASCII text, so it must be percent-encoded
                // before being placed in the query string.
                http.get(baseUrl + encodeURIComponent(phrase), function (res) {
                    res.setEncoding('utf8');
                    var body = "";
                    res.on('data', function (chunk) {
                        body += chunk;
                    }).on('end', function () {
                        try {
                            // SECURITY NOTE(review): eval() executes whatever the
                            // remote server (or anyone tampering with this plain
                            // HTTP connection) sends back. It is kept because the
                            // endpoint presumably returns a JavaScript array
                            // literal that strict JSON.parse may reject - confirm
                            // and replace with a real parser.
                            var obj = eval('(' + body + ')');
                            var translated = obj[0][0][0];
                            // Storing undefined would make JSON.stringify drop
                            // the key and lose the phrase on the next save.
                            if (typeof translated === 'string') {
                                map[phrase] = translated;
                            }
                        } catch (e) {
                            console.log('Unexpected translation response for "' + phrase + '": ' + e.message);
                        }
                        finish();
                    });
                }).on('error', function (e) {
                    console.log('httperror');
                    console.log("Goterror:" + e.message);
                    finish();
                });
            });
        }
    };
})();

/**
 * Recursive directory walker that reads every .js / .html / .htm / .jsp file
 * below a root directory as UTF-8 text.
 *
 * Usage: new File().walkDir(root, fileBack, doneBack)
 *   fileBack(data, pathStr) - called once per matching file; `data` is the
 *                             file content, or "" when the file could not be
 *                             read (the error is logged).
 *   doneBack()              - called once, after every directory has been
 *                             listed and every matching file has been handed
 *                             to fileBack (also when no file matched).
 *
 * One instance tracks a single walk at a time: calling walkDir again resets
 * the internal counter.
 */
function File() {
    // Outstanding asynchronous operations (directory listings + file reads).
    // Counting directory listings too is what keeps doneBack from firing
    // while part of the tree is still unexplored.
    var pending = 0;

    // Extensions are anchored on a literal dot so names like "xjs" do not match.
    var sourceFilePattern = /\.(?:js|html|htm|jsp)$/;

    var _finishOne = function (doneBack) {
        pending--;
        if (pending === 0) {
            doneBack();
        }
    };

    var _readFile = function (pathStr, fileBack, doneBack) {
        pending++;
        fs.readFile(pathStr, { encoding: 'utf8' }, function (err, data) {
            if (err) {
                // Best effort: report the file as empty and keep walking.
                data = "";
                console.log(err, pathStr);
            }
            fileBack(data, pathStr);
            _finishOne(doneBack);
        });
    };

    var _walkDir = function (pathStr, fileBack, doneBack) {
        pending++;
        fs.readdir(pathStr, function (err, files) {
            if (err) {
                // Unreadable directory: log it and skip its subtree.
                console.log(err, pathStr);
                _finishOne(doneBack);
                return;
            }
            files.forEach(function (file) {
                var fullPath = path.join(pathStr, file);
                var isDir;
                try {
                    isDir = fs.statSync(fullPath).isDirectory();
                } catch (statErr) {
                    // e.g. a dangling symlink or an entry removed meanwhile
                    console.log(statErr, fullPath);
                    return;
                }
                if (isDir) {
                    _walkDir(fullPath, fileBack, doneBack);
                } else if (sourceFilePattern.test(file)) {
                    _readFile(fullPath, fileBack, doneBack);
                }
            });
            // Children registered themselves synchronously above, so the
            // counter cannot reach zero here while any of them is outstanding.
            _finishOne(doneBack);
        });
    };

    this.walkDir = function (pathStr, fileBack, doneBack) {
        pending = 0;
        _walkDir(pathStr, fileBack, doneBack);
    };
}

// Step 1: collect every run of Chinese characters found in the project files
// and save them (untranslated) to the dictionary file.
dictionary.logPath = logPath;

new File().walkDir(filePath, function (data) {
    if (!data) {
        return; // empty or unreadable file: nothing to collect
    }
    var match = data.match(/[\u4e00-\u9faf]+/g);
    if (match) {
        match.forEach(function (mat) {
            // Registered without a value, i.e. stored as '' (not translated yet).
            dictionary.set(mat);
        });
    }
}, function () {
    console.log('获取中文OK');
    dictionary.save2File();
});


// Step 2: translate the collected phrases with Google Translate.
// Disabled by default; it loads the dictionary file, fills in the missing
// translations and saves the file again.
/*
dictionary.loadFile(function () {
    dictionary.translateByGoogle(function () {
        dictionary.save2File();
    });
});
*/
// Step 3: replace every Chinese phrase in the files with its translation.
// Disabled by default; it rewrites each walked file in place.
// NOTE(review): the file is written with encoding 'ascii' - presumably because
// the result is expected to be English only; confirm, since any remaining
// non-ASCII character would not survive that encoding.
// NOTE(review): a phrase without a translation is replaced by '' (that is what
// dictionary.get returns for it) - verify the dictionary is complete first.
/*
dictionary.loadFile(function () {
    new File().walkDir(filePath, function (data, pathStr) {
        fs.writeFile(pathStr, data.replace(/[\u4e00-\u9faf]+/g, function (ch) {
            return dictionary.get(ch);
        }), { encoding: 'ascii', flag: 'w' }, function (err) {
            if (err) throw err;
        });
    }, function () {
        console.log('中文替换OK');
    });
});
*/

问题还是有的

1.nodejs编码问题：在Windows环境下对GBK编码支持不好，目前主要只能处理utf8文件

2.效率上面可能可以再通过线程进行优化，这块没做深入的考虑

3.匹配出来，可能有单个的标点符号的短语等情况，需要人工排查

实际情况中，有些文件是GBK的，还有些文件是utf8的，所以后来还是改用脚本语言“快手”重新实现了一遍，实现的时候有两点体会：

1.文件编码的判断问题，通过搜索找到的办法是：

判断文件开头的3个字节是不是 EF BB BF，但是这个方法只适用于有BOM的utf8格式

对无BOM的utf8格式，需要进行字节特征码的判断（有难度，精力有限，使用了上面的方案，对于无BOM的情况，进行人工排查）。

2.因为快手的多线程编程很方便、很简单，我一直以为多线程肯定比单线程效率要好。实际情况却和想的不一样，单线程的比多线程的快多了。看来主要瓶颈还是在读写文件的IO上面。

以上所述就是本文全部内容了，希望大家能够喜欢。

nodejs简单实现中英文翻译

热门推荐

随机推荐