about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Shevek <[email protected]> 2018-09-04 13:47:47 -0700
committer Shevek <[email protected]> 2018-09-04 13:47:47 -0700
commit 63d4d8283509fd42505b65ddc2cb38fe546dffc0 (patch)
tree dcbcc48c879880d5521e19e1377357d78cd1397d
parent f85b12881fd38153178b425f30ea679d4ef7f4e2 (diff)
LexerSource: Support text round-trip of lost and lonely unicode.
-rw-r--r-- src/main/java/org/anarres/cpp/LexerSource.java 17
-rw-r--r-- src/test/java/org/anarres/cpp/LexerSourceTest.java 5
2 files changed, 17 insertions, 5 deletions
diff --git a/src/main/java/org/anarres/cpp/LexerSource.java b/src/main/java/org/anarres/cpp/LexerSource.java
index cf4296f..33268f8 100644
--- a/src/main/java/org/anarres/cpp/LexerSource.java
+++ b/src/main/java/org/anarres/cpp/LexerSource.java
@@ -315,10 +315,11 @@ public class LexerSource extends Source {
/**
* Lexes an escaped character, appends the lexed escape sequence to 'text' and returns the parsed character value.
+ *
* @param text The buffer to which the literal escape sequence is appended.
* @return The new parsed character value.
* @throws IOException if it goes badly wrong.
- * @throws LexerException if it goes wrong.
+ * @throws LexerException if it goes wrong.
*/
private int escape(StringBuilder text)
throws IOException,
@@ -425,7 +426,7 @@ public class LexerSource extends Source {
int e = read();
if (e != '\'') {
// error("Illegal character constant");
- /* We consume up to the next ' or the rest of the line. */
+ /* We consume up to the next ' or the rest of the line. */
for (;;) {
if (isLineSeparator(e)) {
unread(e);
@@ -741,8 +742,7 @@ public class LexerSource extends Source {
text.append((char) c);
for (;;) {
d = read();
- if (ppvalid && isLineSeparator(d)) /* XXX Ugly. */
-
+ if (ppvalid && isLineSeparator(d)) /* XXX Ugly. */
break;
if (Character.isWhitespace(d))
text.append((char) d);
@@ -978,7 +978,14 @@ public class LexerSource extends Source {
} else if (Character.isJavaIdentifierStart(c)) {
tok = identifier(c);
} else {
- tok = new Token(c);
+ String text = TokenType.getTokenText(c);
+ if (text == null) {
+ if ((c >>> 16) == 0) // Character.isBmpCodePoint() is new in 1.7
+ text = Character.toString((char) c);
+ else
+ text = new String(Character.toChars(c));
+ }
+ tok = new Token(c, text);
}
}
diff --git a/src/test/java/org/anarres/cpp/LexerSourceTest.java b/src/test/java/org/anarres/cpp/LexerSourceTest.java
index 96ec4a3..38d0a6f 100644
--- a/src/test/java/org/anarres/cpp/LexerSourceTest.java
+++ b/src/test/java/org/anarres/cpp/LexerSourceTest.java
@@ -137,4 +137,9 @@ public class LexerSourceTest {
testLexerSource("5 /*", false, NUMBER, WHITESPACE, INVALID); // Bug #15
testLexerSource("5 //", false, NUMBER, WHITESPACE, CPPCOMMENT);
}
+
+ @Test
+ public void testUnicode()throws Exception{
+ testLexerSource("foo \u2018bar\u2019 baz", true, IDENTIFIER, WHITESPACE, 8216, IDENTIFIER, 8217, WHITESPACE, IDENTIFIER);
+ }
}