Upgrade to Pro — share decks privately, control downloads, hide ads and more …

JavaScript ♥ Unicode

JavaScript ♥ Unicode

Mathias Bynens

October 19, 2013
Tweet

More Decks by Mathias Bynens

Other Decks in Programming

Transcript

  1. (0x10FFFF + 1) code points ! ↓ ! 17 planes

    (0xFFFF + 1) code points each
  2. Hexadecimal escape sequences >> '\x41\x42\x43' 'ABC' >> '\x61\x62\x63' 'abc' >>

    '\xA9 Caf\xE9 XYZ' '© Café XYZ' ! can be used for U+0000 → U+00FF
  3. Unicode escape sequences >> '\u0041\u0042\u0043' 'ABC' >> 'I \u2661 JavaScript!'

    'I ♡ JavaScript!' ! can be used for U+0000 → U+FFFF
  4. !

  5. Unicode code point escapes >> '\u{41}\u{42}\u{43}' 'ABC' >> '\u{1F4A9}' '💩'

    // U+1F4A9 ! can be used for U+000000 → U+10FFFF ES6
  6. Surrogate pairs % // for astral code points (> 0xFFFF)

    function getSurrogates(codePoint) { var high = Math.floor((codePoint - 0x10000) / 0x400) + 0xD800; var low = (codePoint - 0x10000) % 0x400 + 0xDC00; return [ high, low ]; } ! function getCodePoint(high, low) { var codePoint = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000; return codePoint; } ! >> getSurrogates(0x1F4A9); // U+1F4A9 is 💩 [ 0xD83D, 0xDCA9 ] >> getCodePoint(0xD83D, 0xDCA9); 0x1F4A9 mths.be/bed
  7. JavaScript string length >> 'A'.length // U+0041 1 >> 'A'

    == '\u0041' true >> 'B'.length // U+0042 1 >> 'B' == '\u0042' true
  8. String length ≠ char count >> '𝐀'.length // U+1D400 2

    >> '𝐀' == '\uD835\uDC00' true >> '𝐁'.length // U+1D401 2 >> '𝐁' == '\uD835\uDC01' true
  9. String length ≠ char count >> '💩'.length // U+1F4A9 2

    >> '💩' == '\uD83D\uDCA9' true insert obligatory “number two” joke here
  10. String character count function countSymbols(string) { return punycode.ucs2.decode(string).length; } !

    >> countSymbols('A') // U+0041 1 >> countSymbols('𝐀') // U+1D400 1 >> countSymbols('💩') // U+1F4A9 1 mths.be/punycode
  11. String character count function countSymbols(string) { return Array.from(string).length; } !

    >> countSymbols('A') // U+0041 1 >> countSymbols('𝐀') // U+1D400 1 >> countSymbols('💩') // U+1F4A9 1 ES6
  12. If we’re being pedantic… // it’s actually even more complicated:

    ! >> 'mañana' == 'mañana' false >> 'ma\xF1ana' == 'man\u0303ana' false >> 'ma\xF1ana'.length 6 >> 'man\u0303ana'.length 7
  13. function countSymbolsPedantically(string) { // Unicode Normalization, NFC form: var normalized

    = string.normalize('NFC'); // Account for astral symbols / surrogates: return Array.from(normalized).length; } ! >> countSymbolsPedantically('mañana') // U+00F1 6 >> countSymbolsPedantically('mañana') // U+006E + U+0303 6 Unicode normalization git.io/unorm ES6
  14. Perfect? >> var zalgo = 'H ̹̙̦̮͉̩̗̗ ͧ̇̏̊̾ Eͨ͆͒̆ͮ̃ ͏̷̮̣̫̤̣

    ̵̞̹̻ ̀̉̓ͬ͑͡ ͅ Cͯ̂͐ ͏̨̛͔̦̟͈̻ O ̜͎͍͙͚̬̝̣ ̽ͮ͐͗̀ͤ̍̀ ͢ M ̴̡̲̭͍͇̼̟̯̦ ̉̒͠ Ḛ̛̙̞̪̗ ͥ ͤͩ̾͑̔͐ ͅ Ṯ̴̷̷̗̼͍ ̿̿̓̽͐ H ̙̙ ̔̄ ͜ ';
  15. Perfect? Nope. → can be ‘fixed’ using epic regex-fu >>

    var zalgo = 'H ̹̙̦̮͉̩̗̗ ͧ̇̏̊̾ Eͨ͆͒̆ͮ̃ ͏̷̮̣̫̤̣ ̵̞̹̻ ̀̉̓ͬ͑͡ ͅ Cͯ̂͐ ͏̨̛͔̦̟͈̻ O ̜͎͍͙͚̬̝̣ ̽ͮ͐͗̀ͤ̍̀ ͢ M ̴̡̲̭͍͇̼̟̯̦ ̉̒͠ Ḛ̛̙̞̪̗ ͥ ͤͩ̾͑̔͐ ͅ Ṯ̴̷̷̗̼͍ ̿̿̓̽͐ H ̙̙ ̔̄ ͜ '; ! >> countSymbolsPedantically(zalgo) 116 // not 9
  16. Reversing a string in JavaScript // naive solution function reverse(string)

    { return string.split('').reverse().join(''); }
  17. Reversing a string in JavaScript // naive solution function reverse(string)

    { return string.split('').reverse().join(''); } ! >> reverse('abc') 'cba'
  18. Reversing a string in JavaScript // naive solution function reverse(string)

    { return string.split('').reverse().join(''); } ! >> reverse('abc') 'cba' >> reverse('mañana') // U+00F1 'anañam'
  19. Reversing a string in JavaScript // naive solution function reverse(string)

    { return string.split('').reverse().join(''); } ! >> reverse('abc') 'cba' >> reverse('mañana') // U+00F1 'anañam' >> reverse('mañana') // U+006E + U+0303 'anãnam'
  20. Reversing a string in JavaScript // naive solution function reverse(string)

    { return string.split('').reverse().join(''); } ! >> reverse('abc') 'cba' >> reverse('mañana') // U+00F1 'anañam' >> reverse('mañana') // U+006E + U+0303 'anãnam' >> reverse('!') // U+1F4A9 '��' '\uDCA9\uD83D' // the surrogate pair for !, in the wrong order
  21. “I put my thang down, flip it, and reverse it”

    — Missy ‘Misdemeanor’ Elliott, 2002
  22. Reversing a string in JavaScript // Using the Esrever library

    var reverse = esrever.reverse; ! >> reverse('abc') 'cba' >> reverse('mañana') // U+00F1 'anañam' >> reverse('mañana') // U+006E + U+0303 'anañam' >> reverse('!') // U+1F4A9 '!' mths.be/esrever
  23. String.fromCharCode() >> String.fromCharCode(0x0041) // U+0041 'A' // U+0041 >> String.fromCharCode(0x1F4A9)

    // U+1F4A9 '!' // U+F4A9 ! only works as you’d expect for U+0000 → U+FFFF
  24. String.fromCharCode() → use surrogate pairs for astral symbols: ! >>

    String.fromCharCode(0xD83D, 0xDCA9) '💩' // U+1F4A9 !
  25. String.fromCharCode() → use surrogate pairs for astral symbols: ! >>

    String.fromCharCode(0xD83D, 0xDCA9) '💩' // U+1F4A9 ! → or just use Punycode.js: ! >> punycode.ucs2.encode([ 0x1F4A9 ]) '💩' // U+1F4A9
  26. Iterate over all symbols in a string function getSymbols(string) {

    var length = string.length; var index = -1; var output = []; var character; var charCode; while (++index < length) { character = string.charAt(index); charCode = character.charCodeAt(0); if (charCode >= 0xD800 && charCode <= 0xDBFF) { // note: this doesn’t account for lone high surrogates output.push(character + string.charAt(++index)); } else { output.push(character); } } return output; } ! var symbols = getSymbols('! '); symbols.forEach(function(symbol) { assert(symbol == '! '); });
  27. Iterate over all symbols in a string for (let symbol

    of '!') { assert(symbol == '!'); } ES6
  28. Match any Unicode symbol >> /^.$/.test('!') false // doesn’t match

    line breaks, either ! >> /^[\s\S]$/.test('!') false // matches line breaks, but still doesn’t match whole astral symbols
  29. Match any Unicode symbol >> /^.$/.test('!') false // doesn’t match

    line breaks, either ! >> /^[\s\S]$/.test('!') false // matches line breaks, but still doesn’t match whole astral symbols ! >> /^[\0-\uD7FF\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF]$/.test('!') true // wtf
  30. >> regenerate().addRange(0x0, 0x10FFFF).toString() '[\0-\uD7FF\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF]' >> regenerate() …… .addRange(0x000000, 0x10FFFF)

    // add all Unicode code points …… .removeRange('A', 'z') // remove all symbols from `A` to `z` mths.be/regenerate Create Unicode-aware regular expressions
  31. >> regenerate().addRange(0x0, 0x10FFFF).toString() '[\0-\uD7FF\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800- \uDBFF]' >> regenerate() …… .addRange(0x000000, 0x10FFFF)

    // add all Unicode code points …… .removeRange('A', 'z') // remove all symbols from `A` to `z` …… .remove('#') // remove U+1F4A9 PILE OF POO mths.be/regenerate Create Unicode-aware regular expressions
  32. >> regenerate().addRange(0x0, 0x10FFFF).toString() '[\0-\uD7FF\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800- \uDBFF]' >> regenerate() …… .addRange(0x000000, 0x10FFFF)

    // add all Unicode code points …… .removeRange('A', 'z') // remove all symbols from `A` to `z` …… .remove('#') // remove U+1F4A9 PILE OF POO …… .toString(); mths.be/regenerate Create Unicode-aware regular expressions
  33. >> regenerate().addRange(0x0, 0x10FFFF).toString() '[\0-\uD7FF\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF]' >> regenerate() …… .addRange(0x000000, 0x10FFFF)

    // add all Unicode code points …… .removeRange('A', 'z') // remove all symbols from `A` to `z` …… .remove('💩') // remove U+1F4A9 PILE OF POO …… .toString(); '[\0-\x1F\x21-\x40\x7B-\uD7FF\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF]' mths.be/regenerate Create Unicode-aware regular expressions
  34. >> var regenerate = require('regenerate'); >> var symbols = require('unicode-6.3.0/scripts/Greek/symbols');

    >> var set = regenerate(symbols); >> set.toString(); '[\u0370-\u0373\u0375-\u0377\u037A-\u037D\u0384\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03E1\u03F0-\u03FF\u1D26-\u1D2A\u1D5D-\u1D61\u1D66-\u1D6A\u1DBF\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FC4\u1FC6-\u1FD3\u1FD6-\u1FDB\u1FDD-\u1FEF\u1FF2-\u1FF4\u1FF6-\u1FFE\u2126]|\uD800[\uDD40-\uDD8A]|\uD834[\uDE00-\uDE45]' mths.be/regenerate mths.be/node-unicode-data Create Unicode-aware regular expressions
  35. >> var regenerate = require('regenerate'); >> var symbols = require('unicode-7.0.0/scripts/Greek/symbols');

    >> var set = regenerate(symbols); >> set.toString(); '[\u0370-\u0373\u0375-\u0377\u037A-\u037D\u037F\u0384\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03E1\u03F0-\u03FF\u1D26-\u1D2A\u1D5D-\u1D61\u1D66-\u1D6A\u1DBF\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FC4\u1FC6-\u1FD3\u1FD6-\u1FDB\u1FDD-\u1FEF\u1FF2-\u1FF4\u1FF6-\u1FFE\u2126\uAB65]|\uD800[\uDD40-\uDD8C\uDDA0]|\uD834[\uDE00-\uDE45]' mths.be/regenerate mths.be/node-unicode-data Create Unicode-aware regular expressions
  36. Regex character classes >> /[a-c]/ // matches: // U+0061 LATIN

    SMALL LETTER A // U+0062 LATIN SMALL LETTER B // U+0063 LATIN SMALL LETTER C >> /^[a-c]$/.test('a') true >> /^[a-c]$/.test('b') true >> /^[a-c]$/.test('c') true
  37. >> /[💩-💫]/ // matches: // U+1F4A9 PILE OF POO //

    U+1F4AA FLEXED BICEPS // U+1F4AB DIZZY SYMBOL >> /^[💩-💫]$/.test('💩') true >> /^[💩-💫]$/.test('💪') true >> /^[💩-💫]$/.test('💫') true Regex character classes
  38. >> /[💩-💫]/ // matches: // U+1F4A9 PILE OF POO //

    U+1F4AA FLEXED BICEPS // U+1F4AB DIZZY SYMBOL >> /^[💩-💫]$/.test('💩') true >> /^[💩-💫]$/.test('💪') true >> /^[💩-💫]$/.test('💫') true Regex character classes ✘
  39. Regex character classes >> /[💩-💫]/ SyntaxError: Invalid regular expression: Range

    out of order in character class >> /[\uD83D\uDCA9-\uD83D\uDCAB]/
  40. Regex character classes >> /[💩-💫]/ SyntaxError: Invalid regular expression: Range

    out of order in character class >> /[\uD83D\uDCA9-\uD83D\uDCAB]/
  41. Regex character classes ES6 ✔ >> /[💩-💫]/u // matches: //

    U+1F4A9 PILE OF POO // U+1F4AA FLEXED BICEPS // U+1F4AB DIZZY SYMBOL >> /^[💩-💫]$/u.test('💩') true >> /^[💩-💫]$/u.test('💪') true >> /^[💩-💫]$/u.test('💫') true