I helped a former colleague with a requirement: translating a Chinese-language project into an English one.
As for the implementation, a truly smart solution would require grammatical analysis of the Chinese text, but that seemed too difficult.
So the final scheme is to traverse the files, match out the Chinese phrases, translate them manually, and then replace each Chinese phrase with its translation. Of course, manual testing is still needed afterwards: Chinese text inside the code may affect how the related programs behave.
This problem obviously involves concurrency and file reading and writing, so the first thing that came to mind was Node.js. Although Node.js runs JavaScript on a single main thread, its asynchronous file I/O and event-response mechanism must use threads under the hood, so when actually programming you do not need to think about threading issues.
The code is not complex and is shown below; after writing it, I encapsulated it a little.
// Node.js core modules: file system access and the HTTP client used for translation.
var fs = require('fs');
var http = require('http');

// Root directory of the project to scan, and the file the phrase dictionary is saved to.
// NOTE(review): the backslashes had been stripped from these Windows paths (as they were
// from every regex in this script); restored here — confirm against your own layout.
var filePath = 'd:\\work_new';
var logPath = 'D:\\chinese.log';

// NOTE(review): `map` and `num` are not referenced by the rest of the visible script
// (the dictionary keeps its own private `map`); kept in case other code relies on them.
var map = {};
var num = 0;
/**
 * In-memory phrase dictionary.
 *
 * Maps each Chinese phrase to its English translation ('' while untranslated) and can
 * save itself to / load itself from the JSON file named by `logPath`.
 */
var dictionary = (function () {
    // Phrase -> translation table, private to this closure.
    var map = {};

    // NOTE(review): unofficial Google Translate endpoint taken from the original script;
    // its availability and response format are not guaranteed — verify before relying on it.
    var TRANSLATE_URL = 'http://translate.google.cn/translate_a/t?client=t&hl=zh-CN&sl=zh-CN' +
        '&tl=en&ie=UTF-8&oe=UTF-8&oc=2&otf=1&ssel=3&tsel=6&sc=2&q=';

    return {
        // File the dictionary is written to and read from; callers may overwrite it.
        logPath: 'D:\\chinese.log',

        /**
         * Records `val` as the translation of `key`.
         * A missing or falsy `val` is stored as '' (meaning "not translated yet").
         */
        set: function (key, val) {
            map[key] = val || '';
        },

        /** Returns the stored translation of `key`, or '' when there is none. */
        get: function (key) {
            return map[key] || '';
        },

        /**
         * Writes the table to `this.logPath` as JSON, one entry per line.
         * Throws (from the write callback) if the file cannot be written.
         */
        save2File: function () {
            // Break the single-line JSON after every entry so the file can be edited by hand.
            var text = JSON.stringify(map).replace(/","/g, '",\r\n"');
            fs.writeFile(this.logPath, text, {encoding: 'utf8', flag: 'w'}, function (err) {
                if (err) throw err;
            });
        },

        /**
         * Replaces the table with the JSON content of `this.logPath`, then calls `callback()`.
         * Throws (from the read callback) if the file cannot be read or parsed.
         */
        loadFile: function (callback) {
            fs.readFile(this.logPath, {encoding: 'utf8'}, function (err, data) {
                // Surface the real I/O error instead of failing later inside JSON.parse.
                if (err) throw err;
                map = JSON.parse(data);
                callback();
            });
        },

        /**
         * Requests a translation for every entry whose value is still '' and calls
         * `callback()` once all requests have finished (successfully or not).
         * When nothing needs translating, `callback()` is invoked immediately.
         */
        translateByGoogle: function (callback) {
            // Number of requests that have been started but not yet finished.
            var pending = 0;

            var translateOne = function (key) {
                // Guards against counting one request twice (e.g. 'error' after 'end').
                var settled = false;
                var finish = function () {
                    if (settled) {
                        return;
                    }
                    settled = true;
                    pending--;
                    if (pending === 0) {
                        callback();
                    }
                };

                // The phrase is non-ASCII text, so it must be percent-encoded in the query.
                http.get(TRANSLATE_URL + encodeURIComponent(key), function (res) {
                    res.setEncoding('utf8');
                    var body = '';
                    res.on('data', function (chunk) {
                        body += chunk;
                    }).on('end', function () {
                        try {
                            // SECURITY NOTE(review): eval() on data received from the network.
                            // Kept from the original script; if the response is strict JSON,
                            // switch to JSON.parse — confirm the actual response format first.
                            var obj = eval('(' + body + ')');
                            map[key] = obj[0][0][0];
                        } catch (e) {
                            // Leave the entry untranslated rather than abort the whole batch.
                            console.log('Could not parse translation for ' + key + ': ' + e.message);
                        }
                        finish();
                    });
                }).on('error', function (e) {
                    console.log('HTTP error');
                    console.log('Got error: ' + e.message);
                    finish();
                });
            };

            for (var key in map) {
                if (map[key] === '') {
                    pending++;
                    translateOne(key);
                }
            }

            // HTTP callbacks are asynchronous, so `pending` is still 0 here only when
            // no request was started at all.
            if (pending === 0) {
                callback();
            }
        }
    };
})();
/**
 * Recursive directory walker for source files (.js, .html, .htm, .jsp).
 *
 * Usage: new File().walkDir(root, fileBack, doneBack)
 *   fileBack(data, pathStr) is called once per matching file with its UTF-8 content;
 *   doneBack() is called once, after every directory has been listed and every
 *   matching file has been read (including when no file matches at all).
 */
function File() {
    // Number of asynchronous operations (directory listings and file reads) in flight.
    var pending = 0;

    // Marks one asynchronous operation as finished; fires doneBack after the last one.
    var _finish = function (doneBack) {
        pending--;
        if (pending === 0) {
            doneBack();
        }
    };

    // Reads one file as UTF-8 and hands its content to fileBack.
    var _readFile = function (pathStr, fileBack, doneBack) {
        pending++;
        fs.readFile(pathStr, {encoding: 'utf8'}, function (err, data) {
            if (err) {
                console.log(err, pathStr);
                throw err;
            }
            fileBack(data, pathStr);
            _finish(doneBack);
        });
    };

    // Lists one directory, recursing into subdirectories and reading matching files.
    var _walkDir = function (pathStr, fileBack, doneBack) {
        // The listing itself counts as pending work; otherwise the counter could reach
        // zero while a subdirectory is still waiting to be listed.
        pending++;
        fs.readdir(pathStr, function (err, files) {
            if (err) {
                console.log(err, pathStr);
                throw err;
            }
            files.forEach(function (file) {
                var fullPath = pathStr + '/' + file;
                if (fs.statSync(fullPath).isDirectory()) {
                    _walkDir(fullPath, fileBack, doneBack);
                } else if (/\.(js|html|htm|jsp)$/.test(file)) {
                    _readFile(fullPath, fileBack, doneBack);
                }
            });
            // All children were registered synchronously above, so releasing this
            // listing cannot drop the counter to zero prematurely.
            _finish(doneBack);
        });
    };

    /**
     * Walks the tree rooted at pathStr.
     * @param {string} pathStr - root directory to scan
     * @param {function(string, string)} fileBack - receives (content, path) per file
     * @param {function()} doneBack - called once when the whole walk has finished
     */
    this.walkDir = function (pathStr, fileBack, doneBack) {
        pending = 0;
        _walkDir(pathStr, fileBack, doneBack);
    };
}
Step 1: collect the Chinese phrases
// Save the collected phrases to the configured log file.
dictionary.logPath = logPath;

// Scan every source file under filePath and register each run of Chinese characters
// in the dictionary with an empty translation.
new File().walkDir(filePath, function (data) {
    if (!data) {
        return;
    }
    // \u4e00-\u9faf covers the CJK ideograph range the original script targets.
    var matches = data.match(/[\u4e00-\u9faf]+/g);
    if (!matches) {
        return;
    }
    matches.forEach(function (phrase) {
        dictionary.set(phrase);
    });
}, function () {
    // The walk is complete: persist the phrase list for translation.
    console.log('Get Chinese OK');
    dictionary.save2File();
});
Step 2: translate the phrases with Google Translate
/*
dictionary.loadFile(function () {
    dictionary.translateByGoogle(function () {
        dictionary.save2File();
    });
});
*/
Step 3: replace the Chinese phrases with their translations
/*
dictionary.loadFile(function () {
    new File().walkDir(filePath, function (data, pathStr) {
        fs.writeFile(pathStr, data.replace(/[\u4e00-\u9faf]+/g, function (ch) {
            return dictionary.get(ch);
        }), {encoding: 'ascii', flag: 'w'}, function (err) {
            if (err) throw err;
        });
    }, function () {
        console.log('Chinese substitution OK');
    });
});
*/
Some problems remain:
1. Node.js encoding: GBK is not well supported on Windows, so the script mainly handles UTF-8 files.
2. Efficiency could perhaps be improved further with threads; I have not looked into this in depth.
3. The matches may include single characters, punctuation-only phrases and the like, which need to be checked manually.
In fact, some of the files are GBK and some are UTF-8, so I then thought about handling the task with a scripting language:
1. File encoding, which (according to what I found by searching) can be detected as follows:
Check whether the first 3 bytes of the file are EF BB BF; however, this only works for UTF-8 files with a BOM.
For UTF-8 files without a BOM, the byte patterns have to be analyzed. This is difficult and my energy is limited, so I use the scheme above and check files without a BOM manually.
2. Because multithreading is quick and easy to program there, I had always assumed it would be more efficient than a single thread. The actual result was the opposite: the single-threaded version was faster than the multithreaded one. The main bottleneck appears to be file read/write I/O.
That is the entire content of this article; I hope you find it useful.