LexerSource: Support text round-trip of lost and lonely unicode.

author: Shevek <[email protected]> 2018-09-04 13:47:47 -0700
committer: Shevek <[email protected]> 2018-09-04 13:47:47 -0700
commit: 63d4d8283509fd42505b65ddc2cb38fe546dffc0 (patch)
tree: dcbcc48c879880d5521e19e1377357d78cd1397d
parent: f85b12881fd38153178b425f30ea679d4ef7f4e2 (diff)
2 files changed, 17 insertions, 5 deletions
diff --git a/src/main/java/org/anarres/cpp/LexerSource.java b/src/main/java/org/anarres/cpp/LexerSource.java
index cf4296f..33268f8 100644
--- a/src/main/java/org/anarres/cpp/LexerSource.java
+++ b/src/main/java/org/anarres/cpp/LexerSource.java
@@ -315,10 +315,11 @@ public class LexerSource extends Source {
 
     /**
      * Lexes an escaped character, appends the lexed escape sequence to 'text' and returns the parsed character value.
+     *
      * @param text The buffer to which the literal escape sequence is appended.
      * @return The new parsed character value.
      * @throws IOException if it goes badly wrong.
-     * @throws LexerException  if it goes wrong.
+     * @throws LexerException if it goes wrong.
      */
     private int escape(StringBuilder text)
             throws IOException,
@@ -425,7 +426,7 @@ public class LexerSource extends Source {
         int e = read();
         if (e != '\'') {
             // error("Illegal character constant");
-			/* We consume up to the next ' or the rest of the line. */
+            /* We consume up to the next ' or the rest of the line. */
             for (;;) {
                 if (isLineSeparator(e)) {
                     unread(e);
@@ -741,8 +742,7 @@ public class LexerSource extends Source {
         text.append((char) c);
         for (;;) {
             d = read();
-            if (ppvalid && isLineSeparator(d))	/* XXX Ugly. */
-
+            if (ppvalid && isLineSeparator(d)) /* XXX Ugly. */
                 break;
             if (Character.isWhitespace(d))
                 text.append((char) d);
@@ -978,7 +978,14 @@ public class LexerSource extends Source {
             } else if (Character.isJavaIdentifierStart(c)) {
                 tok = identifier(c);
             } else {
-                tok = new Token(c);
+                String text = TokenType.getTokenText(c);
+                if (text == null) {
+                    if ((c >>> 16) == 0)    // Character.isBmpCodePoint() is new in 1.7
+                        text = Character.toString((char) c);
+                    else
+                        text = new String(Character.toChars(c));
+                }
+                tok = new Token(c, text);
             }
         }
 
diff --git a/src/test/java/org/anarres/cpp/LexerSourceTest.java b/src/test/java/org/anarres/cpp/LexerSourceTest.java
index 96ec4a3..38d0a6f 100644
--- a/src/test/java/org/anarres/cpp/LexerSourceTest.java
+++ b/src/test/java/org/anarres/cpp/LexerSourceTest.java
@@ -137,4 +137,9 @@ public class LexerSourceTest {
         testLexerSource("5 /*", false, NUMBER, WHITESPACE, INVALID);    // Bug #15
         testLexerSource("5 //", false, NUMBER, WHITESPACE, CPPCOMMENT);
     }
+
+    @Test
+    public void testUnicode()throws Exception{
+        testLexerSource("foo \u2018bar\u2019 baz", true, IDENTIFIER, WHITESPACE, 8216, IDENTIFIER, 8217, WHITESPACE, IDENTIFIER);
+    }
 }
author	Shevek <[email protected]>	2018-09-04 13:47:47 -0700
committer	Shevek <[email protected]>	2018-09-04 13:47:47 -0700
commit	63d4d8283509fd42505b65ddc2cb38fe546dffc0 (patch)
tree	dcbcc48c879880d5521e19e1377357d78cd1397d
parent	f85b12881fd38153178b425f30ea679d4ef7f4e2 (diff)