decoder_spec.js 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. var should = require('should'),
  2. needle = require('./../'),
  3. decoder = require('./../lib/decoder'),
  4. Q = require('q'),
  5. chardet = require('jschardet'),
  6. fs = require('fs'),
  7. http = require('http'),
  8. helpers = require('./helpers');
  9. describe('character encoding', function() {
  10. this.timeout(5000);
  /**
   * Builds an HTTP server that answers every request with the contents of a
   * single file read from disk.
   *
   * We used to pull the EUC-JP page from a particular site that is no longer
   * up. The tomcat_charset.html fixture is a local mirror pulled from archive.org:
   * https://web.archive.org/web/20181003202907/http://www.nina.jp/server/slackware/webapp/tomcat_charset.html
   *
   * @param {string} file - Path of the file to send as the response body.
   * @param {string} content_type - Value of the Content-Type header sent with
   *   a successful (200) response.
   * @returns {http.Server} A server that is not yet listening; the caller is
   *   responsible for calling listen() and close().
   */
  function staticServerFor(file, content_type) {
    return http.createServer(function(req, res) {
      req.on('end', function() {
        fs.readFile(file, function(err, data) {
          if (err) {
            // The file could not be read: answer 404 with the serialized error.
            res.writeHead(404);
            res.end(JSON.stringify(err));
            return;
          }
          // writeHead is the documented method; writeHeader is a deprecated alias.
          res.writeHead(200, { 'Content-Type': content_type });
          res.end(data);
        });
      });
      // Discard any request body so the stream flows and 'end' is emitted.
      req.resume();
    });
  }
  /**
   * Serves the tomcat_charset.html fixture with a Content-Type header that
   * declares charset=EUC-JP, then checks the response body with the `decode`
   * option switched off and on.
   */
  describe('Given content-type: "text/html; charset=EUC-JP"', function() {
    var server,
        port = 2233,
        // Declared here on purpose: a bare `url = ...` assignment would create
        // an implicit global shared with every other suite.
        url = 'http://localhost:' + port;

    before(function(done) {
      server = staticServerFor('test/files/tomcat_charset.html', 'text/html; charset=EUC-JP');
      server.listen(port, done);
    });

    after(function(done) {
      server.close(done);
    });

    describe('with decode = false', function() {
      it('does not decode', function(done) {
        needle.get(url, { decode: false }, function(err, resp) {
          // Report a failed request as such instead of crashing on resp.body.
          if (err) return done(err);

          // NOTE(review): `.String` is read as a property, not called; whether
          // that asserts anything depends on the should.js version - confirm.
          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('windows-1252');
          // The Japanese text must not be present in the undecoded body.
          resp.body.indexOf('EUCを使う').should.eql(-1);
          done();
        });
      });
    });

    describe('with decode = true', function() {
      it('decodes', function(done) {
        needle.get(url, { decode: true }, function(err, resp) {
          // Report a failed request as such instead of crashing on resp.body.
          if (err) return done(err);

          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('ascii');
          // After decoding, the Japanese text must be found in the body.
          resp.body.indexOf('EUCを使う').should.not.eql(-1);
          done();
        });
      });
    });
  });
  /**
   * Requests the same page twice, once with `decode: true` and once with
   * `decode: false`, and compares the detected charset and content of the
   * two bodies.
   *
   * NOTE(review): the request goes to http://www.chinesetop100.com/, not to a
   * local fixture, so this test needs outbound network access.
   */
  describe('Given content-type: "text/html but file is charset: gb2312', function() {
    it('encodes to UTF-8', function(done) {
      // Promise-returning wrapper around needle.get, bound to the remote page.
      var fetchPage = Q.nbind(needle.get, needle, 'http://www.chinesetop100.com/');

      // One request per option set; both are started right away.
      var requests = [
        Q.fcall(fetchPage, { decode: true }),
        Q.fcall(fetchPage, { decode: false })
      ];

      // The first fulfilled value carries the response; keep only its body.
      var pendingBodies = requests.map(function(request) {
        return request.then(function(values) {
          return values[0].body;
        });
      });

      // Wait for both requests, then compare the two bodies.
      Q.all(pendingBodies)
        .then(function(bodies) {
          var charsets = [
            chardet.detect(bodies[0]).encoding,
            chardet.detect(bodies[1]).encoding
          ];

          // We wanted to decode our first stream as specified by options
          charsets[0].should.equal('ascii');
          bodies[0].indexOf('全球中文网站前二十强').should.not.equal(-1);

          // But not our second stream
          charsets[1].should.equal('windows-1252');
          bodies[1].indexOf('全球中文网站前二十强').should.equal(-1);
        })
        // Route both request failures and assertion errors to mocha through
        // done(err) rather than letting them escape as uncaught exceptions.
        .then(function() { done(); }, done);
    });
  });
  /**
   * Serves the Appalachia.html fixture with a Content-Type header that
   * declares charset=maccentraleurope, then checks the response body with the
   * `decode` option switched off and on.
   *
   * Fixture source:
   * https://wayback.archive-it.org/3259/20160921140616/https://www.arc.gov/research/MapsofAppalachia.asp?MAP_ID=11
   */
  describe('Given content-type: text/html; charset=maccentraleurope', function() {
    var server,
        port = 2233,
        // Declared here on purpose: a bare `url = ...` assignment would create
        // an implicit global shared with every other suite.
        url = 'http://localhost:' + port;

    before(function(done) {
      server = staticServerFor('test/files/Appalachia.html', 'text/html; charset=maccentraleurope');
      server.listen(port, done);
    });

    after(function(done) {
      server.close(done);
    });

    describe('with decode = false', function() {
      it('does not decode', function(done) {
        needle.get(url, { decode: false }, function(err, resp) {
          // Report a failed request as such instead of crashing on resp.body.
          if (err) return done(err);

          // NOTE(review): `.String` is read as a property, not called; whether
          // that asserts anything depends on the should.js version - confirm.
          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('ascii');
          done();
        });
      });
    });

    describe('with decode = true', function() {
      it('does not explode', function(done) {
        // should.not.throw() only guards the call to needle.get itself; the
        // assertions inside the callback are reported through done().
        (function() {
          needle.get(url, { decode: true }, function(err, resp) {
            // Report a failed request as such instead of crashing on resp.body.
            if (err) return done(err);

            resp.body.should.be.a.String;
            chardet.detect(resp.body).encoding.should.eql('ascii');
            done();
          });
        }).should.not.throw();
      });
    });
  });
  /**
   * Serves a short accented string with a Content-Type of "text/html" (no
   * charset parameter) and checks that the body comes back unchanged when
   * `decode` is false.
   */
  describe('Given content-type: "text/html"', function () {
    var server,
        port = 54321,
        text = 'Magyarországi Fióktelepe';

    before(function(done) {
      server = helpers.server({
        port: port,
        response: text,
        headers: { 'Content-Type': 'text/html' }
      }, done);
    });

    after(function(done) {
      server.close(done);
    });

    describe('with decode = false', function () {
      it('decodes by default to utf-8', function (done) {
        needle.get('http://localhost:' + port, { decode: false }, function (err, resp) {
          // Report a failed request as such instead of crashing on resp.body.
          if (err) return done(err);

          // NOTE(review): `.String` is read as a property, not called; whether
          // that asserts anything depends on the should.js version - confirm.
          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('ISO-8859-2');
          // Compare against the exact string the server was configured with.
          resp.body.should.eql(text);
          done();
        });
      });
    });
  });
  /**
   * Feeds the character '慶' to a decoder stream one byte per write and
   * checks that the concatenated output, read as utf-8, is that character
   * again. The same scenario is run once per source encoding.
   */
  describe('multibyte characters split across chunks', function () {
    // The bytes of '慶' in each encoding under test.
    var samples = [
      { encoding: 'utf-8',   bytes: [0xE6, 0x85, 0xB6] },
      { encoding: 'euc-jp',  bytes: [0xB7, 0xC4] },
      { encoding: 'gb18030', bytes: [0x91, 0x63] }
    ];

    samples.forEach(function(sample) {
      describe('with encoding = ' + sample.encoding, function() {
        var stream,
            collected = Buffer.allocUnsafe(0);

        before(function(done) {
          stream = decoder(sample.encoding);
          done();
        });

        it('reassembles split multibyte characters', function (done) {
          // Accumulate everything the decoder emits.
          stream.on("data", function(piece) {
            collected = Buffer.concat([ collected, piece ]);
          });

          stream.on("end", function() {
            collected.toString("utf-8").should.eql('慶');
            done();
          });

          // Write '慶' in the source encoding, split across one-byte chunks.
          sample.bytes.forEach(function(byte) {
            stream.write(Buffer.from([byte]));
          });
          stream.end();
        });
      });
    });
  });
  212. })