summaryrefslogtreecommitdiffstats
path: root/src/java/org/anarres/cpp/LexerSource.java
diff options
context:
space:
mode:
authorShevek <[email protected]>2008-03-21 23:05:04 +0000
committerShevek <[email protected]>2008-03-21 23:05:04 +0000
commit5ff55648127c8a8e1b9829775045af986e37647c (patch)
treeb28209b1efe12824fbdcabd4ee9073e93ca30636 /src/java/org/anarres/cpp/LexerSource.java
parentfca34200881fcaf7b84b4210f7a5f40c8925c4d1 (diff)
move stuff into trunk
Diffstat (limited to 'src/java/org/anarres/cpp/LexerSource.java')
-rw-r--r--src/java/org/anarres/cpp/LexerSource.java677
1 files changed, 677 insertions, 0 deletions
diff --git a/src/java/org/anarres/cpp/LexerSource.java b/src/java/org/anarres/cpp/LexerSource.java
new file mode 100644
index 0000000..a291bff
--- /dev/null
+++ b/src/java/org/anarres/cpp/LexerSource.java
@@ -0,0 +1,677 @@
+/*
+ * Anarres C Preprocessor
+ * Copyright (C) 2007 Shevek
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+package org.anarres.cpp;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PushbackReader;
+import java.io.Reader;
+import java.util.Stack;
+
+import static org.anarres.cpp.Token.*;
+
+/** Does not handle digraphs. */
+public class LexerSource extends Source {
+ private static final boolean DEBUG = false;
+
+ private PushbackReader reader;
+ private boolean ppvalid;
+ private boolean bol;
+ private boolean include;
+
+ private int line;
+ private int column;
+ private int lastcolumn;
+ private boolean cr;
+
+ /* ppvalid is:
+ * false in StringLexerSource,
+ * true in FileLexerSource */
+ public LexerSource(Reader r, boolean ppvalid) {
+     /* Position tracking: first line is 1, nothing consumed yet,
+      * no previous-line column recorded, no pending '\r'. */
+     line = 1;
+     column = 0;
+     lastcolumn = -1;
+     cr = false;
+
+     /* Lexer flags: we start at the beginning of a line with the
+      * include flag cleared. */
+     bol = true;
+     include = false;
+     this.ppvalid = ppvalid;
+
+     /* Input is filtered through a JoinReader and wrapped so that
+      * up to 5 characters can be pushed back. */
+     reader = new PushbackReader(new JoinReader(r), 5);
+ }
+
+ /**
+  * Returns the line number currently tracked by this lexer.
+  * The counter starts at 1 and is adjusted by read() and unread().
+  */
+ @Override
+ public int getLine() {
+ return line;
+ }
+
+ /**
+  * Returns the column currently tracked by this lexer.
+  * The counter starts at 0 and is reset to 0 by read() whenever a
+  * line separator is counted.
+  */
+ public int getColumn() {
+ return column;
+ }
+
+ /**
+  * Always returns true for a LexerSource.
+  * NOTE(review): presumably tells the preprocessor that this source
+  * carries meaningful line numbers - confirm against Source.
+  */
+ /* pp */ boolean isNumbered() {
+ return true;
+ }
+
+/* Error handling - this lot is barely worth it. */
+
+ private final void _error(String msg, boolean error)
+ throws LexerException {
+ int _l = line;
+ int _c = column;
+ if (_c == 0) {
+ _c = lastcolumn;
+ _l--;
+ }
+ else {
+ _c--;
+ }
+ if (error)
+ super.error(_l, _c, msg);
+ else
+ super.warning(_l, _c, msg);
+ }
+
+ /**
+  * Reports msg as an error at the position of the last character read.
+  *
+  * @param msg the diagnostic text.
+  * @throws LexerException if the underlying handler throws it.
+  */
+ private final void error(String msg)
+ throws LexerException {
+ _error(msg, true);
+ }
+
+ /**
+  * Reports msg as a warning at the position of the last character read.
+  *
+  * @param msg the diagnostic text.
+  * @throws LexerException if the underlying handler throws it.
+  */
+ private final void warning(String msg)
+ throws LexerException {
+ _error(msg, false);
+ }
+
+/* A flag for string handling. */
+
+ /**
+  * Sets the include flag. While it is set, token() lexes '<' as the
+  * start of a <...> header name, string() does not interpret
+  * backslash escapes, and each '\n' is returned as its own NL token.
+  */
+ /* pp */ void setInclude(boolean b) {
+ this.include = b;
+ }
+
+/*
+ private boolean _isLineSeparator(int c) {
+ return Character.getType(c) == Character.LINE_SEPARATOR
+ || c == -1;
+ }
+*/
+
+ /* XXX Move to JoinReader and canonicalise newlines. */
+ /**
+  * Tells whether c terminates a line. End of input (-1) is treated
+  * as a line separator too. The value is compared after narrowing
+  * to char.
+  */
+ private static final boolean isLineSeparator(int c) {
+     if (c == -1)
+         return true;
+     final char ch = (char)c;
+     return ch == '\r'
+         || ch == '\n'
+         || ch == '\u2028'
+         || ch == '\u2029'
+         || ch == '\u000B'
+         || ch == '\u000C'
+         || ch == '\u0085';
+ }
+
+
+ private int read() throws IOException {
+ int c = reader.read();
+ switch (c) {
+ case '\r':
+ cr = true;
+ line++;
+ lastcolumn = column;
+ column = 0;
+ break;
+ case '\n':
+ if (cr) {
+ cr = false;
+ break;
+ }
+ /* fallthrough */
+ case '\u2028':
+ case '\u2029':
+ case '\u000B':
+ case '\u000C':
+ case '\u0085':
+ cr = false;
+ line++;
+ lastcolumn = column;
+ column = 0;
+ break;
+ default:
+ cr = false;
+ column++;
+ break;
+ }
+
+/*
+ if (isLineSeparator(c)) {
+ line++;
+ lastcolumn = column;
+ column = 0;
+ }
+ else {
+ column++;
+ }
+*/
+
+ return c;
+ }
+
+ /* You can unget AT MOST one newline. */
+ /**
+  * Pushes c back onto the reader and rewinds the position counters.
+  * End of input (-1) is silently ignored. Only one saved column is
+  * kept, so the position can be restored across a single line
+  * separator only.
+  */
+ private void unread(int c)
+         throws IOException {
+     if (c == -1)
+         return;
+
+     if (!isLineSeparator(c)) {
+         column--;
+     }
+     else {
+         line--;
+         column = lastcolumn;
+         cr = false;
+     }
+     reader.unread(c);
+ }
+
+ private Token ccomment()
+ throws IOException {
+ StringBuilder text = new StringBuilder("/*");
+ int d;
+ do {
+ do {
+ d = read();
+ text.append((char)d);
+ } while (d != '*');
+ do {
+ d = read();
+ text.append((char)d);
+ } while (d == '*');
+ } while (d != '/');
+ return new Token(COMMENT, text.toString());
+ }
+
+ /**
+  * Lexes the remainder of a // comment; the two slashes have
+  * already been consumed. The terminating line separator (or end
+  * of input) is not part of the comment and is pushed back.
+  *
+  * @return a COMMENT token whose text starts with "//".
+  */
+ private Token cppcomment()
+         throws IOException {
+     StringBuilder body = new StringBuilder("//");
+     for (;;) {
+         int next = read();
+         if (isLineSeparator(next)) {
+             unread(next);
+             break;
+         }
+         body.append((char)next);
+     }
+     return new Token(COMMENT, body.toString());
+ }
+
+ /**
+  * Lexes the part of an escape sequence that follows the backslash.
+  * The caller has already consumed the backslash and appended it
+  * to text.
+  *
+  * Handles the single-letter escapes, octal escapes of up to three
+  * digits, and \x followed by up to two hexadecimal digits. Any
+  * other character produces a warning and stands for itself.
+  *
+  * @param text receives the source spelling of the escape (without
+  *             the leading backslash).
+  * @return the character value the escape denotes.
+  */
+ private int escape(StringBuilder text)
+         throws IOException,
+         LexerException {
+     int d = read();
+     int len = 0;
+     int val = 0;
+     switch (d) {
+     /* \a is the alert (BEL) character, 0x07; 0x0a would be '\n'. */
+     case 'a': text.append('a'); return 0x07;
+     case 'b': text.append('b'); return '\b';
+     case 'f': text.append('f'); return '\f';
+     case 'n': text.append('n'); return '\n';
+     case 'r': text.append('r'); return '\r';
+     case 't': text.append('t'); return '\t';
+     case 'v': text.append('v'); return 0x0b;
+     case '\\': text.append('\\'); return '\\';
+
+     case '0': case '1': case '2': case '3':
+     case '4': case '5': case '6': case '7':
+         /* d is already the first octal digit. */
+         do {
+             val = (val << 3) + Character.digit(d, 8);
+             text.append((char)d);
+             d = read();
+         } while (++len < 3 && Character.digit(d, 8) != -1);
+         unread(d);
+         return val;
+
+     case 'x':
+         /* The 'x' is only a marker, not a digit: record it and
+          * start accumulating from the character after it. */
+         text.append('x');
+         d = read();
+         while (len < 2 && Character.digit(d, 16) != -1) {
+             val = (val << 4) + Character.digit(d, 16);
+             text.append((char)d);
+             d = read();
+             len++;
+         }
+         unread(d);
+         return val;
+
+     /* Exclude two cases from the warning. */
+     case '"': text.append('"'); return '"';
+     case '\'': text.append('\''); return '\'';
+
+     default:
+         warning("Unnecessary escape character " + (char)d);
+         text.append((char)d);
+         return d;
+     }
+ }
+
+ /**
+  * Lexes a character literal; the opening quote has already been
+  * consumed by the caller.
+  *
+  * @return a CHARACTER token whose text is the quoted source
+  *         spelling and whose value is the character, or an ERROR
+  *         token (after reporting an error) if the literal is empty,
+  *         unterminated or not closed after one character.
+  */
+ private Token character()
+ throws IOException,
+ LexerException {
+ StringBuilder text = new StringBuilder("'");
+ int d = read();
+ if (d == '\\') {
+ text.append('\\');
+ /* d becomes the value denoted by the escape sequence. */
+ d = escape(text);
+ }
+ else if (isLineSeparator(d)) {
+ /* Also covers end of input, which isLineSeparator() accepts.
+ * The separator is pushed back before reporting. */
+ unread(d);
+ error("Unterminated character literal");
+ return new Token(ERROR, text.toString(), null);
+ }
+ else if (d == '\'') {
+ text.append('\'');
+ error("Empty character literal");
+ return new Token(ERROR, text.toString(), null);
+ }
+ else if (!Character.isDefined(d)) {
+ /* Reported, but lexing continues with '?' in the text. */
+ text.append('?');
+ error("Illegal unicode character literal");
+ }
+ else {
+ text.append((char)d);
+ }
+
+ /* Exactly one character is allowed before the closing quote. */
+ int e = read();
+ if (e != '\'') {
+ unread(e);
+ error("Illegal character constant");
+ /* XXX We could do some patching up here? */
+ return new Token(ERROR, text.toString(), null);
+ }
+ text.append('\'');
+ /* XXX Bad cast. */
+ return new Token(CHARACTER,
+ text.toString(), Character.valueOf((char)d));
+ }
+
+ /* XXX This strips the enclosing quotes from the
+ * returned value. */
+ /**
+  * Lexes a delimited literal; the opening delimiter has already
+  * been consumed by the caller.
+  *
+  * Two buffers are built: text holds the source spelling including
+  * both delimiters, buf holds the value without delimiters and with
+  * escapes replaced by the characters they denote.
+  *
+  * @param open the opening delimiter, recorded in the token text.
+  * @param close the closing delimiter; '>' yields a HEADER token,
+  *              anything else a STRING token.
+  * @return the literal token, or an ERROR token (after reporting an
+  *         error) if a line separator or end of input is reached
+  *         first.
+  */
+ private Token string(char open, char close)
+ throws IOException,
+ LexerException {
+ StringBuilder text = new StringBuilder();
+ text.append(open);
+
+ StringBuilder buf = new StringBuilder();
+
+ for (;;) {
+ int c = read();
+ if (c == close) {
+ break;
+ }
+ else if (c == '\\') {
+ text.append('\\');
+ /* Escapes are only interpreted outside include mode.
+ * NOTE(review): in include mode the backslash is added
+ * to text but not to buf, so it is missing from the
+ * token value - confirm this is intended. */
+ if (!include) {
+ char d = (char)escape(text);
+ buf.append(d);
+ }
+ }
+ else if (c == -1) {
+ /* Must be tested before isLineSeparator(), which also
+ * accepts -1. unread(-1) is a no-op. */
+ unread(c);
+ error("End of file in string literal after " + buf);
+ return new Token(ERROR, text.toString(), null);
+ }
+ else if (isLineSeparator(c)) {
+ /* Push the separator back before reporting. */
+ unread(c);
+ error("Unterminated string literal after " + buf);
+ return new Token(ERROR, text.toString(), null);
+ }
+ else {
+ text.append((char)c);
+ buf.append((char)c);
+ }
+ }
+ text.append(close);
+ return new Token(close == '>' ? HEADER : STRING,
+ text.toString(), buf.toString());
+ }
+
+ /**
+  * Consumes an optional integer suffix following a numeric
+  * constant and appends it to the token text.
+  *
+  * Accepts an optional 'U' followed by an optional 'L' or 'I'.
+  * 'U' and 'L' are matched in either case, since C allows both
+  * spellings (e.g. 10u, 10UL, 10ul); 'I' is matched in upper case
+  * only, as before. If the character after the optional 'U' is not
+  * part of a suffix it is pushed back.
+  *
+  * @param text the token text being accumulated.
+  * @param d the first character after the digits, already read.
+  */
+ private void number_suffix(StringBuilder text, int d)
+         throws IOException {
+     if (d == 'U' || d == 'u') {
+         text.append((char)d);
+         d = read();
+     }
+     if (d == 'L' || d == 'l' || d == 'I') {
+         text.append((char)d);
+     }
+     else {
+         unread(d);
+     }
+ }
+
+ /* We already chewed a zero, so empty is fine. */
+ /**
+  * Lexes an octal constant whose leading '0' has already been
+  * consumed; a lone "0" is therefore valid. An optional suffix
+  * is consumed by number_suffix().
+  *
+  * @return an INTEGER token carrying the value as a Long.
+  */
+ private Token number_octal()
+         throws IOException,
+         LexerException {
+     StringBuilder spelling = new StringBuilder("0");
+     long value = 0;
+     int next;
+     for (next = read(); Character.digit(next, 8) != -1; next = read()) {
+         value = (value << 3) + Character.digit(next, 8);
+         spelling.append((char)next);
+     }
+     number_suffix(spelling, next);
+     return new Token(INTEGER,
+                      spelling.toString(), Long.valueOf(value));
+ }
+
+ /* We do not know whether the first digit is valid. */
+ private Token number_hex(char x)
+ throws IOException,
+ LexerException {
+ StringBuilder text = new StringBuilder("0");
+ text.append(x);
+ int d = read();
+ if (Character.digit(d, 16) == -1) {
+ unread(d);
+ error("Illegal hexadecimal constant " + (char)d);
+ return new Token(ERROR, text.toString(), null);
+ }
+ long val = 0;
+ do {
+ val = (val << 4) + Character.digit(d, 16);
+ text.append((char)d);
+ d = read();
+ } while (Character.digit(d, 16) != -1);
+ number_suffix(text, d);
+ return new Token(INTEGER,
+ text.toString(), Long.valueOf(val));
+ }
+
+ /* We know we have at least one valid digit, but empty is not
+ * fine. */
+ /* XXX This needs a complete rewrite. */
+ /**
+  * Lexes a decimal integer constant.
+  *
+  * @param c the first digit, already consumed by the caller and
+  *          known to be a digit.
+  * @return an INTEGER token carrying the value as a Long.
+  */
+ private Token number_decimal(int c)
+         throws IOException,
+         LexerException {
+     /* Start empty: the loop below appends c itself. Passing
+      * (char)c to the constructor would select StringBuilder(int)
+      * and merely set the initial capacity to the character code. */
+     StringBuilder text = new StringBuilder();
+     int d = c;
+     long val = 0;
+     do {
+         val = val * 10 + Character.digit(d, 10);
+         text.append((char)d);
+         d = read();
+     } while (Character.digit(d, 10) != -1);
+     number_suffix(text, d);
+     return new Token(INTEGER,
+                      text.toString(), Long.valueOf(val));
+ }
+
+ /**
+  * Lexes an identifier.
+  *
+  * Characters for which Character.isIdentifierIgnorable() holds are
+  * consumed but left out of the token text. The first character
+  * that is neither ignorable nor a Java identifier part ends the
+  * identifier and is pushed back.
+  *
+  * @param c the first character, already consumed by the caller.
+  * @return an IDENTIFIER token.
+  */
+ private Token identifier(int c)
+         throws IOException,
+         LexerException {
+     StringBuilder name = new StringBuilder();
+     name.append((char)c);
+
+     int next = read();
+     while (Character.isIdentifierIgnorable(next)
+             || Character.isJavaIdentifierPart(next)) {
+         if (!Character.isIdentifierIgnorable(next))
+             name.append((char)next);
+         next = read();
+     }
+     unread(next);
+     return new Token(IDENTIFIER, name.toString());
+ }
+
+ /**
+  * Lexes a run of whitespace.
+  *
+  * When ppvalid is set the run stops in front of any line
+  * separator; otherwise line separators that count as whitespace
+  * are absorbed into the run. The character that ends the run is
+  * pushed back.
+  *
+  * @param c the first whitespace character, already consumed.
+  * @return a WHITESPACE token.
+  */
+ private Token whitespace(int c)
+         throws IOException,
+         LexerException {
+     StringBuilder run = new StringBuilder();
+     run.append((char)c);
+
+     int next = read();
+     while (!(ppvalid && isLineSeparator(next))
+             && Character.isWhitespace(next)) {
+         run.append((char)next);
+         next = read();
+     }
+     unread(next);
+     return new Token(WHITESPACE, run.toString());
+ }
+
+ /* No token processed by cond() contains a newline. */
+ /**
+  * Chooses between a two-character and a one-character operator.
+  * Reads one character; if it equals c the token of type yes is
+  * returned, otherwise the character is pushed back and the token
+  * of type no is returned.
+  */
+ private Token cond(char c, int yes, int no)
+         throws IOException {
+     int next = read();
+     if (next != c) {
+         unread(next);
+         return new Token(no);
+     }
+     return new Token(yes);
+ }
+
+ /**
+  * Reads and returns the next token from the input.
+  *
+  * The line and column in effect before the first character is
+  * read are recorded and stamped onto the returned token. Every
+  * token except the NL tokens returned early clears the
+  * beginning-of-line flag.
+  *
+  * @return the next token; an EOF token at end of input.
+  * @throws IOException if reading the underlying reader fails.
+  * @throws LexerException if a reported lexical error is raised as
+  *         an exception by the error handler.
+  */
+ public Token token()
+ throws IOException,
+ LexerException {
+ Token tok = null;
+
+ /* Position of the token's first character. */
+ int _l = line;
+ int _c = column;
+
+ int c = read();
+ int d, e;
+
+ /* Each case either sets tok or leaves it null; null is resolved
+ * by the generic classification after the switch. */
+ switch (c) {
+ case '\n':
+ if (ppvalid) {
+ bol = true;
+ if (include) {
+ /* In include mode every newline is its own token. */
+ tok = new Token(NL, _l, _c, new String("\n"));
+ }
+ else {
+ /* Merge a run of consecutive '\n' into one NL token
+ * whose text has one '\n' per newline consumed. */
+ int nls = 0;
+ do {
+ d = read();
+ nls++;
+ } while (d == '\n');
+ unread(d);
+ char[] text = new char[nls];
+ for (int i = 0; i < text.length; i++)
+ text[i] = '\n';
+ // Skip the bol = false below.
+ tok = new Token(NL, _l, _c, new String(text));
+ }
+ if (DEBUG)
+ System.out.println("lx: Returning NL: " + tok);
+ return tok;
+ }
+ /* Let it be handled as whitespace. */
+ break;
+
+ case '!':
+ tok = cond('=', NE, '!');
+ break;
+
+ case '#':
+ /* HASH is only produced at the beginning of a line;
+ * elsewhere '#' is either PASTE ("##") or a plain '#'. */
+ if (bol)
+ tok = new Token(HASH);
+ else
+ tok = cond('#', PASTE, '#');
+ break;
+
+ case '+':
+ d = read();
+ if (d == '+')
+ tok = new Token(INC);
+ else if (d == '=')
+ tok = new Token(PLUS_EQ);
+ else
+ unread(d);
+ break;
+ case '-':
+ d = read();
+ if (d == '-')
+ tok = new Token(DEC);
+ else if (d == '=')
+ tok = new Token(SUB_EQ);
+ else if (d == '>')
+ tok = new Token(ARROW);
+ else
+ unread(d);
+ break;
+
+ case '*':
+ tok = cond('=', MULT_EQ, '*');
+ break;
+ case '/':
+ /* Either kind of comment, "/=", or a plain '/'. */
+ d = read();
+ if (d == '*')
+ tok = ccomment();
+ else if (d == '/')
+ tok = cppcomment();
+ else if (d == '=')
+ tok = new Token(DIV_EQ);
+ else
+ unread(d);
+ break;
+
+ case '%':
+ tok = cond('=', MOD_EQ, '%');
+ break;
+
+ case ':':
+ /* :: */
+ /* Nothing is combined here; ':' falls through to the
+ * single-character fallback below. */
+ break;
+
+ case '<':
+ if (include) {
+ /* While the include flag is set, '<' opens a <...>
+ * header name. */
+ tok = string('<', '>');
+ }
+ else {
+ d = read();
+ if (d == '=')
+ tok = new Token(LE);
+ else if (d == '<')
+ tok = cond('=', LSH_EQ, LSH);
+ else
+ unread(d);
+ }
+ break;
+
+ case '=':
+ tok = cond('=', EQ, '=');
+ break;
+
+ case '>':
+ d = read();
+ if (d == '=')
+ tok = new Token(GE);
+ else if (d == '>')
+ tok = cond('=', RSH_EQ, RSH);
+ else
+ unread(d);
+ break;
+
+ case '^':
+ tok = cond('=', XOR_EQ, '^');
+ break;
+
+ case '|':
+ d = read();
+ if (d == '=')
+ tok = new Token(OR_EQ);
+ else if (d == '|')
+ tok = cond('=', LOR_EQ, LOR);
+ else
+ unread(d);
+ break;
+ case '&':
+ d = read();
+ if (d == '&')
+ tok = cond('=', LAND_EQ, LAND);
+ else if (d == '=')
+ tok = new Token(AND_EQ);
+ else
+ unread(d);
+ break;
+
+ case '.':
+ /* "..." is ELLIPSIS, ".." is RANGE, otherwise a plain '.'. */
+ d = read();
+ if (d == '.')
+ tok = cond('.', ELLIPSIS, RANGE);
+ else
+ unread(d);
+ /* XXX decimal fraction */
+ break;
+
+ case '0':
+ /* octal or hex */
+ d = read();
+ if (d == 'x' || d == 'X')
+ tok = number_hex((char)d);
+ else {
+ unread(d);
+ tok = number_octal();
+ }
+ break;
+
+ case '\'':
+ tok = character();
+ break;
+
+ case '"':
+ tok = string('"', '"');
+ break;
+
+ case -1:
+ tok = new Token(EOF, _l, _c, "<eof>");
+ break;
+ }
+
+ /* Generic classification for anything the switch did not
+ * resolve: operators that did not combine with a following
+ * character, and characters that have no case above. */
+ if (tok == null) {
+ if (Character.isWhitespace(c)) {
+ tok = whitespace(c);
+ }
+ else if (Character.isDigit(c)) {
+ tok = number_decimal(c);
+ }
+ else if (Character.isJavaIdentifierStart(c)) {
+ tok = identifier(c);
+ }
+ else {
+ /* Presumably a token whose type is the character itself;
+ * see the Token(int) constructor. */
+ tok = new Token(c);
+ }
+ }
+
+ bol = false;
+
+ tok.setLocation(_l, _c);
+ if (DEBUG)
+ System.out.println("lx: Returning " + tok);
+ // (new Exception("here")).printStackTrace(System.out);
+ return tok;
+ }
+
+}