Strings and Regular Expressions

sora.lee 2018. 12. 25. 18:16

https://leanpub.com/understandinges6/read#leanpub-auto-better-unicode-support 을 정리한 내용입니다.

향상된 유니코드 지원

ECMAScript 5 까지는 16-bit 문자 인코딩 방식(UTF-16)을 사용했다.

length와 charAt()은 16-bit 단위로 계산되었다.

그러나 Unicode는 16 bits만으로 표현되지 않는다.

// "𠮷" (U+20BB7) lies outside the Basic Multilingual Plane, so UTF-16 stores
// it as a surrogate pair: two 16-bit code units for one visible character.
var text = "𠮷";
var anySingleChar = /^.$/;                  // without the u flag "." matches one code unit
console.log(text.length);                   // 2 (code units, not characters)
console.log(anySingleChar.test(text));      // false
console.log(text.charAt(0));                // "\uD842" - lone high surrogate, not printable
console.log(text.charAt(1));                // "\uDFB7" - lone low surrogate, not printable
console.log(text.charCodeAt(0));            // 55362 (0xD842)
console.log(text.charCodeAt(1));            // 57271 (0xDFB7)

ECMAScript 6 에는 이러한 문제를 해결하기 위한 기능들이 추가되었다.

codePointAt()

// charCodeAt() always returns a single UTF-16 code unit, while codePointAt()
// returns the full code point when the index is the start of a surrogate pair.
const text = "𠮷a";
console.log(text.charCodeAt(0));    // 55362  - high surrogate
console.log(text.charCodeAt(1));    // 57271  - low surrogate
console.log(text.charCodeAt(2));    // 97     - "a"
console.log(text.codePointAt(0));   // 134071 - U+20BB7, the whole surrogate pair
console.log(text.codePointAt(1));   // 57271  - index 1 is the low surrogate, returned as-is
console.log(text.codePointAt(2));   // 97     - "a"

어떤 문자가 16 bits로 표현되는지, 32 bits로 표현되는지 확인하기 위해서 codePointAt()을 사용할 수 있다.

/**
 * Reports whether the first character of a string needs two UTF-16 code units.
 *
 * @param {string} c - String whose first code point is inspected.
 * @returns {boolean} true when that code point is above U+FFFF.
 */
function is32Bit(c) {
    const LARGEST_16_BIT_VALUE = 0xFFFF;
    const firstCodePoint = c.codePointAt(0);
    return firstCodePoint > LARGEST_16_BIT_VALUE;
}
console.log(is32Bit("𠮷"));         // true
console.log(is32Bit("a"));          // false

String.fromCodePoint()

String.fromCodePoint()는 주어진 코드 포인트에 대한 문자를 반환한다.

// String.fromCodePoint() is the inverse of codePointAt(): 134071 is U+20BB7.
const fromCodePoint = String.fromCodePoint(134071);
console.log(fromCodePoint);  // "𠮷"

normalize()

Unicode에서는 서로 다른 문자열이 같다고 간주되는 경우가 존재한다.

Canonical equivalence: 두 개의 코드 포인트 시퀀스를 상호교환할 수 있다. 예를 들어 é는 e와 ◌́의 조합으로 교환 가능하다.
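Compatibility: 두 코드 포인트 시퀀스가 모양은 다르지만 특정 상황에서는 상호교환해서 사용할 수 있다. 예를 들어 µ(U+00B5, MICRO SIGN)는 호환 정규화(NFKC/NFKD)를 거치면 μ(U+03BC, GREEK SMALL LETTER MU)로 바뀐다.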

normalize()를 사용해서 문자열을 비교하면 canonical equivalence를 확인할 수 있다.

// Canonical equivalence demo: "é" can be written as one precomposed code
// point or as "e" followed by a combining acute accent.
// The variables are declared with const: bare assignments would create
// implicit globals and throw a ReferenceError in strict mode / ES modules.
const a = String.fromCodePoint(101); // e  (U+0065)
const b = String.fromCodePoint(769); // ◌́  (U+0301, combining acute accent)
const c = String.fromCodePoint(233); // é  (U+00E9, precomposed)
const d = a + b;                     // "e" + combining accent: looks the same as c
console.log(c === d); // false - the code unit sequences differ
console.log(c.normalize() === d.normalize()); // true - both normalize to the same form

정규표현식의 u 플래그

정규표현식은 16-bit 단위로 문자를 매칭한다.

u 플래그를 사용하면 정규표현식이 16-bit 코드 유닛이 아니라 코드 포인트 단위로 문자를 매칭한다.

// The same pattern with and without the u flag: only the u version treats the
// surrogate pair as a single character.
const text = "𠮷";
const perCodeUnit = /^.$/;                  // "." consumes one 16-bit code unit
const perCodePoint = /^.$/u;                // "." consumes one whole code point
console.log(text.length);                   // 2
console.log(perCodeUnit.test(text));        // false
console.log(perCodePoint.test(text));       // true

ECMAScript 6은 코드 포인트 개수를 반환하는 메서드를 추가하지 않았다.

대신 다음과 같이 계산할 수 있다.

/**
 * Counts the code points in a string (as opposed to `length`, which counts
 * UTF-16 code units).
 *
 * @param {string} text - String to measure.
 * @returns {number} Number of code points; 0 for an empty string.
 */
function codePointLength(text) {
    // [\s\S] matches any character, whitespace or not. With the g and u flags
    // match() returns one array entry per code point, or null if nothing matched.
    const codePoints = text.match(/[\s\S]/gu);
    if (!codePoints) {
        return 0;
    }
    return codePoints.length;
}
console.log(codePointLength("abc"));    // 3
console.log(codePointLength("𠮷bc"));   // 3

챕터 8에서 소개하는 string iterator를 사용해서 문자열의 길이를 계산할 수도 있다.

문자열의 길이를 계산하는 것은 긴 문자열에 대해서는 빠르지 않으므로, 사용을 최소화하는 것이 좋다.

정규표현식의 기타 변경사항

y 플래그

y(sticky) 플래그를 사용하면 정규표현식의 lastIndex 위치에서 정확히 시작하는 매치만 허용한다. 매치에 실패하면 lastIndex는 0으로 초기화된다.

// One pattern, three flag variants, run against the same text.
const text = "hello1 hello2 hi hello3";
const pattern = /hello\d\s?/;           // no flag: every exec() starts again at index 0
const globalPattern = /hello\d\s?/g;    // g: searches forward from lastIndex
const stickyPattern = /hello\d\s?/y;    // y: must match exactly at lastIndex

// Without a flag lastIndex is never advanced, so the first match repeats.
console.log(pattern.exec(text)[0]); // "hello1 "
console.log(pattern.exec(text)[0]); // "hello1 "
console.log(pattern.exec(text)[0]); // "hello1 "

// The global search skips over "hi " to find the third match.
console.log(globalPattern.exec(text)[0]); // "hello1 "
console.log(globalPattern.exec(text)[0]); // "hello2 "
console.log(globalPattern.exec(text)[0]); // "hello3"

// The sticky search fails on the third call because "hi" sits at lastIndex.
console.log(stickyPattern.exec(text)[0]); // "hello1 "
console.log(stickyPattern.exec(text)[0]); // "hello2 "
console.log(stickyPattern.exec(text)); // null

Template literals

ECMAScript 5의 부족한 점을 보완하기 위해서 ` (backtick) 으로 구분된 문자열이 추가되었다.

여러 줄의 문자열

ECMAScript 5에서 여러 줄의 문자열을 만드는 방법

// ES5 option 1: "\n" inserts the newline, and the trailing backslash continues
// the string literal on the next source line (the continuation itself adds
// nothing to the string).
var message = "Multiline \n\
string";
// ES5 option 2: join the individual lines with "\n".
var message = ["Multiline ", "string"].join("\n");

ECMAScript 6에서 여러 줄의 문자열을 만드는 방법

// A template literal keeps the line break typed in the source,
// so no "\n" escape or concatenation is needed.
const message = `Multiline
string`;

Tag

const name = 'John';
const age = 32;
// myTag is a function declaration, so it is hoisted and can be used here
// before its definition below.
const msg = myTag`And ${name} is in his/her ${age}.`;

/**
 * Template tag that replaces the exact age with its decade ("30s"), or with
 * "is a child." when the age is below 10.
 *
 * @param {string[]} strings - Literal segments of the template; for the
 *     template above: ["And ", " is in his/her ", "."].
 * @param {string} name - Value of the first substitution.
 * @param {number} age - Value of the second substitution.
 * @returns {string} The assembled sentence.
 */
function myTag(strings, name, age) {
    if (age < 10) {
        // The leading space keeps the name and the rest of the sentence apart.
        return `${strings[0]}${name} is a child.`;
    }
    // Truncate to the decade: 32 -> 30.
    const decade = Math.trunc(age / 10) * 10;
    // Append the final literal segment so the trailing "." is not lost.
    const ending = strings[2] || '';
    return `${strings[0]}${name}${strings[1]}${decade}s${ending}`;
}
console.log(msg); // "And John is in his/her 30s."

// The default template literal processes the "\n" escape into a newline.
const message1 = `Multiline\nstring`;
// The String.raw tag returns the text as typed: a backslash followed by "n".
const message2 = String.raw`Multiline\nstring`;

console.log(message1);          // "Multiline
                                //  string"
console.log(message2);          // "Multiline\\nstring"

저작자표시 비영리 변경금지 (새창열림)