From 63d4d8283509fd42505b65ddc2cb38fe546dffc0 Mon Sep 17 00:00:00 2001 From: Shevek Date: Tue, 4 Sep 2018 13:47:47 -0700 Subject: LexerSource: Support text round-trip of lost and lonely unicode. --- src/main/java/org/anarres/cpp/LexerSource.java | 17 ++++++++++++----- src/test/java/org/anarres/cpp/LexerSourceTest.java | 5 +++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/anarres/cpp/LexerSource.java b/src/main/java/org/anarres/cpp/LexerSource.java index cf4296f..33268f8 100644 --- a/src/main/java/org/anarres/cpp/LexerSource.java +++ b/src/main/java/org/anarres/cpp/LexerSource.java @@ -315,10 +315,11 @@ public class LexerSource extends Source { /** * Lexes an escaped character, appends the lexed escape sequence to 'text' and returns the parsed character value. + * * @param text The buffer to which the literal escape sequence is appended. * @return The new parsed character value. * @throws IOException if it goes badly wrong. - * @throws LexerException if it goes wrong. + * @throws LexerException if it goes wrong. */ private int escape(StringBuilder text) throws IOException, @@ -425,7 +426,7 @@ public class LexerSource extends Source { int e = read(); if (e != '\'') { // error("Illegal character constant"); - /* We consume up to the next ' or the rest of the line. */ + /* We consume up to the next ' or the rest of the line. */ for (;;) { if (isLineSeparator(e)) { unread(e); @@ -741,8 +742,7 @@ public class LexerSource extends Source { text.append((char) c); for (;;) { d = read(); - if (ppvalid && isLineSeparator(d)) /* XXX Ugly. */ - + if (ppvalid && isLineSeparator(d)) /* XXX Ugly. */ break; if (Character.isWhitespace(d)) text.append((char) d); @@ -978,7 +978,14 @@ public class LexerSource extends Source { } else if (Character.isJavaIdentifierStart(c)) { tok = identifier(c); } else { - tok = new Token(c); + String text = TokenType.getTokenText(c); + if (text == null) { + if ((c >>> 16) == 0) // Character.isBmpCodePoint() is new in 1.7 + text = Character.toString((char) c); + else + text = new String(Character.toChars(c)); + } + tok = new Token(c, text); } } diff --git a/src/test/java/org/anarres/cpp/LexerSourceTest.java b/src/test/java/org/anarres/cpp/LexerSourceTest.java index 96ec4a3..38d0a6f 100644 --- a/src/test/java/org/anarres/cpp/LexerSourceTest.java +++ b/src/test/java/org/anarres/cpp/LexerSourceTest.java @@ -137,4 +137,9 @@ public class LexerSourceTest { testLexerSource("5 /*", false, NUMBER, WHITESPACE, INVALID); // Bug #15 testLexerSource("5 //", false, NUMBER, WHITESPACE, CPPCOMMENT); } + + @Test + public void testUnicode()throws Exception{ + testLexerSource("foo \u2018bar\u2019 baz", true, IDENTIFIER, WHITESPACE, 8216, IDENTIFIER, 8217, WHITESPACE, IDENTIFIER); + } } -- cgit v1.2.3