  // shim.js (7.8 KB)
  // Taken from: https://github.com/walling/unorm/blob/master/lib/unorm.js
  /*
   * UnicodeNormalizer 1.0.0
   * Copyright (c) 2008 Matsuza
   * Dual licensed under the MIT (MIT-LICENSE.txt) and
   * GPL (GPL-LICENSE.txt) licenses.
   * $Date: 2008-06-05 16:44:17 +0200 (Thu, 05 Jun 2008) $
   * $Rev: 13309 $
   */
  'use strict';

  // Project-local helpers and the Unicode data table.
  var primitiveSet = require('../../../object/primitive-set');
  var validValue = require('../../../object/valid-value');
  var data = require('./_data');

  var floor = Math.floor;

  // Lookup of the accepted normalization form names (tested as `forms[form]`).
  var forms = primitiveSet('NFC', 'NFD', 'NFKC', 'NFKD');

  // Feature record used for code points without an entry of their own. The
  // slots are read as: [0] decomposition, [1] flags, [2] composition map.
  var DEFAULT_FEATURE = [null, 0, {}];

  // A cache bucket's counter must exceed this before lookups are memoized.
  var CACHE_THRESHOLD = 10;

  // Constants for the arithmetic (rule based) Jamo / syllable mapping.
  var SBase = 0xAC00;
  var LBase = 0x1100;
  var VBase = 0x1161;
  var TBase = 0x11A7;
  var LCount = 19;
  var VCount = 21;
  var TCount = 28;
  var NCount = VCount * TCount;
  var SCount = LCount * NCount;

  // Lookup cache and its per-bucket counters.
  var cache = {};
  var cacheCounter = [];

  // Forward declarations; the values are assigned further down the file.
  var UChar;
  var i;
  var fromCache;
  var fromData;
  var fromCpOnly;
  var fromRuleBasedJamo;
  var fromCpFilter;
  var strategies;
  var UCharIterator;
  var RecursDecompIterator;
  var DecompIterator;
  var CompIterator;
  var createIterator;
  var normalize;
  /**
   * One Unicode code point together with its normalization feature record.
   *
   * @constructor
   * @param {number} cp - Code point value.
   * @param {Array|null} feature - Feature record, or a falsy value when the
   *   record should be loaded lazily (see `prepFeature`).
   */
  UChar = function (cp, feature) {
    this.codepoint = cp;
    this.feature = feature;
  };
  // Strategies

  // Start every one of the 256 cache bucket counters at zero.
  i = 0;
  while (i <= 0xFF) {
    cacheCounter[i] = 0;
    ++i;
  }
  /**
   * Strategy: answer from the cache when possible, otherwise delegate.
   *
   * A delegated result is stored only when it carries a feature record and the
   * counter of its bucket (bits 8-15 of the code point) has grown past
   * CACHE_THRESHOLD; the counter is bumped only for results with a feature.
   *
   * @param {Function} next - Next strategy, called as `next(cp, needFeature)`.
   * @param {number} cp - Code point to look up.
   * @param {boolean} needFeature - Passed through to `next`.
   * @returns {Object} The cached or freshly resolved result.
   */
  fromCache = function (next, cp, needFeature) {
    var hit = cache[cp], bucket;
    if (hit) return hit;
    hit = next(cp, needFeature);
    if (hit.feature) {
      bucket = (cp >> 8) & 0xFF;
      cacheCounter[bucket] += 1;
      if (cacheCounter[bucket] > CACHE_THRESHOLD) cache[cp] = hit;
    }
    return hit;
  };
  /**
   * Strategy: resolve a code point from the `UChar.udata` table.
   *
   * The table is read in two steps: first by `cp & 0xFF00`, then by the code
   * point itself. A missing page or entry yields DEFAULT_FEATURE.
   *
   * @param {Function} next - Unused; this is the last strategy in the chain.
   * @param {number} cp - Code point to look up.
   * @param {boolean} needFeature - Unused.
   * @returns {UChar} A new UChar carrying the table entry or DEFAULT_FEATURE.
   */
  fromData = function (next, cp, needFeature) {
    var page = UChar.udata[cp & 0xFF00] || {};
    return new UChar(cp, page[cp] || DEFAULT_FEATURE);
  };
  /**
   * Strategy: skip the feature lookup entirely when it was not requested.
   *
   * @param {Function} next - Next strategy, used only when a feature is needed.
   * @param {number} cp - Code point to wrap.
   * @param {boolean} needFeature - Truthy to delegate to `next`.
   * @returns {UChar} Result of `next`, or a UChar whose feature is `null`.
   */
  fromCpOnly = function (next, cp, needFeature) {
    if (needFeature) return next(cp, needFeature);
    return new UChar(cp, null);
  };
  /**
   * Strategy: compute feature records arithmetically for leading consonants
   * (LBase .. LBase + LCount - 1) and precomposed syllables
   * (SBase .. SBase + SCount - 1) instead of reading them from the data table.
   *
   * - Leading consonant: only a composition map (slot 2), keyed by vowel code
   *   point and giving the resulting syllable without a trailing consonant.
   * - Syllable with a trailing consonant: a two-element decomposition (slot 0)
   *   into the same syllable without the trailing part, plus the trailing part.
   * - Syllable without a trailing consonant: a decomposition into leading
   *   consonant and vowel (slot 0), plus a composition map (slot 2) keyed by
   *   trailing consonant.
   *
   * Every other code point is handed to `next`.
   *
   * @param {Function} next - Next strategy, called as `next(cp, needFeature)`.
   * @param {number} cp - Code point to resolve.
   * @param {boolean} needFeature - Passed through to `next`.
   * @returns {UChar} Computed UChar, or whatever `next` returns.
   */
  fromRuleBasedJamo = function (next, cp, needFeature) {
    var composites, feature, base, SIndex, TIndex, k;

    // The syllable block is the half-open range [SBase, SBase + SCount), so
    // the code point SBase + SCount itself has to be delegated as well. The
    // previous strict `<` treated it as a syllable and invented a
    // decomposition for it.
    if (cp < LBase || (LBase + LCount <= cp && cp < SBase) ||
        (SBase + SCount <= cp)) {
      return next(cp, needFeature);
    }

    feature = [];

    if (LBase <= cp && cp < LBase + LCount) {
      // Leading consonant: one target syllable per vowel.
      composites = {};
      base = (cp - LBase) * VCount;
      for (k = 0; k < VCount; ++k) {
        composites[VBase + k] = SBase + TCount * (k + base);
      }
      // Slots 0 and 1 are intentionally left unset.
      feature[2] = composites;
      return new UChar(cp, feature);
    }

    SIndex = cp - SBase;
    TIndex = SIndex % TCount;
    if (TIndex !== 0) {
      // Split off the trailing consonant.
      feature[0] = [SBase + SIndex - TIndex, TBase + TIndex];
    } else {
      // Split into leading consonant and vowel ...
      feature[0] = [
        LBase + floor(SIndex / NCount),
        VBase + floor((SIndex % NCount) / TCount)
      ];
      // ... and allow any trailing consonant to be attached.
      feature[2] = {};
      for (k = 1; k < TCount; ++k) {
        feature[2][TBase + k] = cp + k;
      }
    }
    return new UChar(cp, feature);
  };
  /**
   * Strategy: short-circuit two code point ranges straight to DEFAULT_FEATURE.
   *
   * Code points below 60, and those strictly between 13311 and 42607, never
   * reach the later strategies.
   *
   * @param {Function} next - Next strategy, called as `next(cp, needFeature)`.
   * @param {number} cp - Code point to resolve.
   * @param {boolean} needFeature - Passed through to `next`.
   * @returns {UChar} UChar with DEFAULT_FEATURE, or whatever `next` returns.
   */
  fromCpFilter = function (next, cp, needFeature) {
    if (cp < 60 || (13311 < cp && cp < 42607)) {
      return new UChar(cp, DEFAULT_FEATURE);
    }
    return next(cp, needFeature);
  };
  // Lookup strategies in the order they are consulted.
  strategies = [fromCpFilter, fromCache, fromCpOnly, fromRuleBasedJamo, fromData];

  /**
   * Resolve a code point through the strategy chain.
   *
   * Each strategy receives the remainder of the chain as its `next` argument;
   * the last strategy receives `null`.
   *
   * @param {number} cp - Code point to resolve.
   * @param {boolean} [needFeature] - Passed to every strategy.
   * @returns {UChar} Whatever the first answering strategy returns.
   */
  UChar.fromCharCode = (function () {
    var chain = null, idx;
    var link = function (strategy, next) {
      return function (cp, needFeature) {
        return strategy(next, cp, needFeature);
      };
    };
    for (idx = strategies.length - 1; idx >= 0; --idx) {
      chain = link(strategies[idx], chain);
    }
    return chain;
  }());
  /**
   * @param {number} cp - UTF-16 code unit.
   * @returns {boolean} True when `cp` lies in 0xD800..0xDBFF.
   */
  UChar.isHighSurrogate = function (cp) {
    return 0xD800 <= cp && cp <= 0xDBFF;
  };
  /**
   * @param {number} cp - UTF-16 code unit.
   * @returns {boolean} True when `cp` lies in 0xDC00..0xDFFF.
   */
  UChar.isLowSurrogate = function (cp) {
    return 0xDC00 <= cp && cp <= 0xDFFF;
  };
  /**
   * Make sure `this.feature` is populated.
   *
   * When the record is still missing it is fetched with
   * `UChar.fromCharCode(codepoint, true)`; an existing record is left alone.
   */
  UChar.prototype.prepFeature = function () {
    if (this.feature) return;
    this.feature = UChar.fromCharCode(this.codepoint, true).feature;
  };
  /**
   * Convert the code point back to a JavaScript string.
   *
   * @returns {string} One UTF-16 code unit for values below 0x10000,
   *   otherwise a surrogate pair.
   */
  UChar.prototype.toString = function () {
    var cp = this.codepoint, offset;
    if (cp < 0x10000) return String.fromCharCode(cp);
    offset = cp - 0x10000;
    return String.fromCharCode(
      floor(offset / 0x400) + 0xD800,
      (offset % 0x400) + 0xDC00
    );
  };
  /**
   * @returns {Array|null} Slot 0 of the feature record (the decomposition), or
   *   `null` when that slot is empty.
   */
  UChar.prototype.getDecomp = function () {
    var decomp;
    this.prepFeature();
    decomp = this.feature[0];
    return decomp ? decomp : null;
  };
  /**
   * Test bit 8 of the flags in slot 1 of the feature record.
   *
   * @returns {boolean|number} `false` when the flags are missing or zero,
   *   otherwise the masked value (0 or 256), so use it for truthiness only.
   */
  UChar.prototype.isCompatibility = function () {
    var flags;
    this.prepFeature();
    flags = this.feature[1];
    if (!flags) return false;
    return flags & (1 << 8);
  };
  /**
   * Test bit 9 of the flags in slot 1 of the feature record.
   *
   * @returns {boolean|number} `false` when the flags are missing or zero,
   *   otherwise the masked value (0 or 512), so use it for truthiness only.
   */
  UChar.prototype.isExclude = function () {
    var flags;
    this.prepFeature();
    flags = this.feature[1];
    if (!flags) return false;
    return flags & (1 << 9);
  };
  /**
   * @returns {number} The low eight bits of the flags in slot 1 of the feature
   *   record (the canonical class), or 0 when the flags are missing or zero.
   */
  UChar.prototype.getCanonicalClass = function () {
    var flags;
    this.prepFeature();
    flags = this.feature[1];
    if (!flags) return 0;
    return flags & 0xff;
  };
  /**
   * Look up the result of combining this character with the one after it.
   *
   * @param {UChar} following - The character that follows this one.
   * @returns {UChar|null} The composed character, resolved through
   *   `UChar.fromCharCode`, or `null` when slot 2 of the feature record is
   *   missing or has no entry for `following.codepoint`.
   */
  UChar.prototype.getComposite = function (following) {
    var table, composed;
    this.prepFeature();
    table = this.feature[2];
    if (!table) return null;
    composed = table[following.codepoint];
    if (!composed) return null;
    return UChar.fromCharCode(composed);
  };
  /**
   * Iterator over the code points of a string.
   *
   * @constructor
   * @param {string} str - Text to walk; reading starts at index 0.
   */
  UCharIterator = function (str) {
    this.str = str;
    this.cursor = 0;
  };
  /**
   * Read the next code point from the string.
   *
   * A high surrogate directly followed by a low surrogate is combined into a
   * single code point; an unpaired surrogate is returned as is. Once the input
   * is used up the string reference is dropped.
   *
   * @returns {UChar|null} Result of `UChar.fromCharCode`, or `null` at the end.
   */
  UCharIterator.prototype.next = function () {
    var text = this.str, lead, trail;
    if (!text || !(this.cursor < text.length)) {
      this.str = null;
      return null;
    }
    lead = text.charCodeAt(this.cursor);
    this.cursor += 1;
    if (UChar.isHighSurrogate(lead) && this.cursor < text.length) {
      trail = text.charCodeAt(this.cursor);
      if (UChar.isLowSurrogate(trail)) {
        this.cursor += 1;
        return UChar.fromCharCode(
          (lead - 0xD800) * 0x400 + (trail - 0xDC00) + 0x10000
        );
      }
    }
    return UChar.fromCharCode(lead);
  };
  /**
   * Iterator that replaces each incoming character by its full decomposition.
   *
   * @constructor
   * @param {Object} it - Source iterator; its `next()` yields UChar or a falsy
   *   value when exhausted.
   * @param {boolean} cano - When truthy, characters flagged as compatibility
   *   decompositions are passed through undecomposed.
   */
  RecursDecompIterator = function (it, cano) {
    this.it = it;
    this.canonical = cano;
    // Decomposed characters not yet handed out.
    this.resBuf = [];
  };
  /**
   * Yield the next fully decomposed character.
   *
   * When the buffer is empty, one character is pulled from the source and
   * expanded recursively: every element of its decomposition is resolved with
   * `UChar.fromCharCode` and expanded in turn. A character is kept as is when
   * it has no decomposition, or when this iterator is canonical and the
   * character reports a compatibility decomposition.
   *
   * @returns {UChar|null|undefined} Next character; `null` when the source is
   *   exhausted.
   */
  RecursDecompIterator.prototype.next = function () {
    var canonicalOnly = this.canonical, source;
    var expand = function (uchar) {
      var parts = uchar.getDecomp(), out, idx;
      if (!parts || (canonicalOnly && uchar.isCompatibility())) return [uchar];
      out = [];
      for (idx = 0; idx < parts.length; ++idx) {
        // Append in place; `concat` would return a new array instead of
        // extending `out`.
        out.push.apply(out, expand(UChar.fromCharCode(parts[idx])));
      }
      return out;
    };
    if (this.resBuf.length === 0) {
      source = this.it.next();
      if (!source) return null;
      this.resBuf = expand(source);
    }
    return this.resBuf.shift();
  };
  /**
   * Iterator that reorders decomposed characters by canonical class.
   *
   * @constructor
   * @param {Object} it - Source iterator; its `next()` yields UChar or a falsy
   *   value when exhausted.
   */
  DecompIterator = function (it) {
    this.it = it;
    // Characters already ordered but not yet handed out.
    this.resBuf = [];
  };
  /**
   * Yield the next character in canonical order.
   *
   * When the buffer is empty, characters are pulled from the source until one
   * with canonical class 0 arrives or the source runs dry. A character with a
   * non-zero class is inserted in front of any buffered characters whose class
   * is strictly greater, so equal classes keep their relative order. A class 0
   * character is always appended at the end.
   *
   * @returns {UChar|undefined} Next character, or `undefined` when nothing is
   *   left.
   */
  DecompIterator.prototype.next = function () {
    var pending = this.resBuf, incoming, cls, pos;
    if (pending.length > 0) return pending.shift();
    for (;;) {
      incoming = this.it.next();
      if (!incoming) break;
      cls = incoming.getCanonicalClass();
      pos = pending.length;
      if (cls !== 0) {
        while (pos > 0 && pending[pos - 1].getCanonicalClass() > cls) pos -= 1;
      }
      pending.splice(pos, 0, incoming);
      if (cls === 0) break;
    }
    return pending.shift();
  };
  /**
   * Iterator that recombines decomposed characters.
   *
   * @constructor
   * @param {Object} it - Source iterator; its `next()` yields UChar or a falsy
   *   value when exhausted.
   */
  CompIterator = function (it) {
    this.it = it;
    // Run currently being assembled; element 0 is the character that later
    // ones may be merged into.
    this.procBuf = [];
    // Finished characters not yet handed out.
    this.resBuf = [];
    // Canonical class of the character most recently kept in procBuf.
    this.lastClass = null;
  };
  /**
   * Yield the next composed character.
   *
   * Characters are pulled from the source into `procBuf`. An incoming
   * character is merged into the first buffered one when a composite exists
   * and the previously kept character had class 0 or a class lower than the
   * incoming one. Otherwise it is appended; if its class is 0 the run collected
   * so far is released for output first. When the source ends, whatever is
   * buffered is released.
   *
   * @returns {UChar|undefined} Next character, or `undefined` when nothing is
   *   left.
   */
  CompIterator.prototype.next = function () {
    var incoming, merged, incomingClass;
    while (this.resBuf.length === 0) {
      incoming = this.it.next();
      if (!incoming) {
        // Source exhausted: release the run in progress.
        this.resBuf = this.procBuf;
        this.procBuf = [];
        break;
      }
      if (this.procBuf.length === 0) {
        this.lastClass = incoming.getCanonicalClass();
        this.procBuf.push(incoming);
        continue;
      }
      merged = this.procBuf[0].getComposite(incoming);
      incomingClass = incoming.getCanonicalClass();
      if (merged && (this.lastClass === 0 || this.lastClass < incomingClass)) {
        this.procBuf[0] = merged;
        continue;
      }
      if (incomingClass === 0) {
        // A class 0 character closes the current run and opens a new one.
        this.resBuf = this.procBuf;
        this.procBuf = [];
      }
      this.lastClass = incomingClass;
      this.procBuf.push(incoming);
    }
    return this.resBuf.shift();
  };
  /**
   * Build the iterator pipeline for a normalization form.
   *
   * Every form decomposes and reorders (UCharIterator -> RecursDecompIterator
   * -> DecompIterator). NFD and NFC restrict decomposition to canonical
   * mappings; NFC and NFKC additionally wrap the pipeline in a CompIterator.
   *
   * @param {string} mode - One of "NFD", "NFKD", "NFC", "NFKC".
   * @param {string} str - Text to normalize.
   * @returns {Object} Iterator whose `next()` yields normalized UChars.
   * @throws {Error} When `mode` is not one of the four supported forms. An
   *   Error object replaces the previously thrown bare string so the failure
   *   carries a stack trace; the message text is unchanged.
   */
  createIterator = function (mode, str) {
    var canonical, compose, pipeline;
    switch (mode) {
    case "NFD":
      canonical = true;
      compose = false;
      break;
    case "NFKD":
      canonical = false;
      compose = false;
      break;
    case "NFC":
      canonical = true;
      compose = true;
      break;
    case "NFKC":
      canonical = false;
      compose = true;
      break;
    default:
      throw new Error(mode + " is invalid");
    }
    pipeline = new DecompIterator(
      new RecursDecompIterator(new UCharIterator(str), canonical)
    );
    return compose ? new CompIterator(pipeline) : pipeline;
  };
  /**
   * Normalize a string by draining the iterator pipeline for `mode`.
   *
   * @param {string} mode - Normalization form handed to `createIterator`.
   * @param {string} str - Text to normalize.
   * @returns {string} Concatenation of every yielded character's `toString()`.
   */
  normalize = function (mode, str) {
    var source = createIterator(mode, str), result = "", uchar;
    for (uchar = source.next(); uchar; uchar = source.next()) {
      result += uchar.toString();
    }
    return result;
  };
  252. /* Unicode data */
  253. UChar.udata = data;
  254. module.exports = function (/*form*/) {
  255. var str = String(validValue(this)), form = arguments[0];
  256. if (form === undefined) form = 'NFC';
  257. else form = String(form);
  258. if (!forms[form]) throw new RangeError('Invalid normalization form: ' + form);
  259. return normalize(form, str);
  260. };