diff options
Diffstat (limited to 'src/java/org/anarres/cpp/LexerSource.java')
-rw-r--r-- | src/java/org/anarres/cpp/LexerSource.java | 677 |
1 files changed, 677 insertions, 0 deletions
diff --git a/src/java/org/anarres/cpp/LexerSource.java b/src/java/org/anarres/cpp/LexerSource.java new file mode 100644 index 0000000..a291bff --- /dev/null +++ b/src/java/org/anarres/cpp/LexerSource.java @@ -0,0 +1,677 @@ +/* + * Anarres C Preprocessor + * Copyright (C) 2007 Shevek + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +package org.anarres.cpp; + +import java.io.File; +import java.io.IOException; +import java.io.PushbackReader; +import java.io.Reader; +import java.util.Stack; + +import static org.anarres.cpp.Token.*; + +/** Does not handle digraphs. */ +public class LexerSource extends Source { + private static final boolean DEBUG = false; + + private PushbackReader reader; + private boolean ppvalid; + private boolean bol; + private boolean include; + + private int line; + private int column; + private int lastcolumn; + private boolean cr; + + /* ppvalid is: + * false in StringLexerSource, + * true in FileLexerSource */ + public LexerSource(Reader r, boolean ppvalid) { + this.reader = new PushbackReader(new JoinReader(r), 5); + this.ppvalid = ppvalid; + this.bol = true; + this.include = false; + + this.line = 1; + this.column = 0; + this.lastcolumn = -1; + this.cr = false; + } + + @Override + public int getLine() { + return line; + } + + public int getColumn() { + return column; + } + + /* pp */ boolean isNumbered() { + return true; + } + +/* Error handling - this lot is barely worth it. */ + + private final void _error(String msg, boolean error) + throws LexerException { + int _l = line; + int _c = column; + if (_c == 0) { + _c = lastcolumn; + _l--; + } + else { + _c--; + } + if (error) + super.error(_l, _c, msg); + else + super.warning(_l, _c, msg); + } + + private final void error(String msg) + throws LexerException { + _error(msg, true); + } + + private final void warning(String msg) + throws LexerException { + _error(msg, false); + } + +/* A flag for string handling. */ + + /* pp */ void setInclude(boolean b) { + this.include = b; + } + +/* + private boolean _isLineSeparator(int c) { + return Character.getType(c) == Character.LINE_SEPARATOR + || c == -1; + } +*/ + + /* XXX Move to JoinReader and canonicalise newlines. */ + private static final boolean isLineSeparator(int c) { + switch ((char)c) { + case '\r': + case '\n': + case '\u2028': + case '\u2029': + case '\u000B': + case '\u000C': + case '\u0085': + return true; + default: + return (c == -1); + } + } + + + private int read() throws IOException { + int c = reader.read(); + switch (c) { + case '\r': + cr = true; + line++; + lastcolumn = column; + column = 0; + break; + case '\n': + if (cr) { + cr = false; + break; + } + /* fallthrough */ + case '\u2028': + case '\u2029': + case '\u000B': + case '\u000C': + case '\u0085': + cr = false; + line++; + lastcolumn = column; + column = 0; + break; + default: + cr = false; + column++; + break; + } + +/* + if (isLineSeparator(c)) { + line++; + lastcolumn = column; + column = 0; + } + else { + column++; + } +*/ + + return c; + } + + /* You can unget AT MOST one newline. */ + private void unread(int c) + throws IOException { + if (c != -1) { + if (isLineSeparator(c)) { + line--; + column = lastcolumn; + cr = false; + } + else { + column--; + } + reader.unread(c); + } + } + + private Token ccomment() + throws IOException { + StringBuilder text = new StringBuilder("/*"); + int d; + do { + do { + d = read(); + text.append((char)d); + } while (d != '*'); + do { + d = read(); + text.append((char)d); + } while (d == '*'); + } while (d != '/'); + return new Token(COMMENT, text.toString()); + } + + private Token cppcomment() + throws IOException { + StringBuilder text = new StringBuilder("//"); + int d = read(); + while (!isLineSeparator(d)) { + text.append((char)d); + d = read(); + } + unread(d); + return new Token(COMMENT, text.toString()); + } + + private int escape(StringBuilder text) + throws IOException, + LexerException { + int d = read(); + switch (d) { + case 'a': text.append('a'); return 0x0a; + case 'b': text.append('b'); return '\b'; + case 'f': text.append('f'); return '\f'; + case 'n': text.append('n'); return '\n'; + case 'r': text.append('r'); return '\r'; + case 't': text.append('t'); return '\t'; + case 'v': text.append('v'); return 0x0b; + case '\\': text.append('\\'); return '\\'; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + int len = 0; + int val = 0; + do { + val = (val << 3) + Character.digit(d, 8); + text.append((char)d); + d = read(); + } while (++len < 3 && Character.digit(d, 8) != -1); + unread(d); + return val; + + case 'x': + len = 0; + val = 0; + do { + val = (val << 4) + Character.digit(d, 16); + text.append((char)d); + d = read(); + } while (++len < 2 && Character.digit(d, 16) != -1); + unread(d); + return val; + + /* Exclude two cases from the warning. */ + case '"': text.append('"'); return '"'; + case '\'': text.append('\''); return '\''; + + default: + warning("Unnecessary escape character " + (char)d); + text.append((char)d); + return d; + } + } + + private Token character() + throws IOException, + LexerException { + StringBuilder text = new StringBuilder("'"); + int d = read(); + if (d == '\\') { + text.append('\\'); + d = escape(text); + } + else if (isLineSeparator(d)) { + unread(d); + error("Unterminated character literal"); + return new Token(ERROR, text.toString(), null); + } + else if (d == '\'') { + text.append('\''); + error("Empty character literal"); + return new Token(ERROR, text.toString(), null); + } + else if (!Character.isDefined(d)) { + text.append('?'); + error("Illegal unicode character literal"); + } + else { + text.append((char)d); + } + + int e = read(); + if (e != '\'') { + unread(e); + error("Illegal character constant"); + /* XXX We could do some patching up here? */ + return new Token(ERROR, text.toString(), null); + } + text.append('\''); + /* XXX Bad cast. */ + return new Token(CHARACTER, + text.toString(), Character.valueOf((char)d)); + } + + /* XXX This strips the enclosing quotes from the + * returned value. */ + private Token string(char open, char close) + throws IOException, + LexerException { + StringBuilder text = new StringBuilder(); + text.append(open); + + StringBuilder buf = new StringBuilder(); + + for (;;) { + int c = read(); + if (c == close) { + break; + } + else if (c == '\\') { + text.append('\\'); + if (!include) { + char d = (char)escape(text); + buf.append(d); + } + } + else if (c == -1) { + unread(c); + error("End of file in string literal after " + buf); + return new Token(ERROR, text.toString(), null); + } + else if (isLineSeparator(c)) { + unread(c); + error("Unterminated string literal after " + buf); + return new Token(ERROR, text.toString(), null); + } + else { + text.append((char)c); + buf.append((char)c); + } + } + text.append(close); + return new Token(close == '>' ? HEADER : STRING, + text.toString(), buf.toString()); + } + + private void number_suffix(StringBuilder text, int d) + throws IOException { + if (d == 'U') { + text.append((char)d); + d = read(); + } + if (d == 'L') { + text.append((char)d); + } + else if (d == 'I') { + text.append((char)d); + } + else { + unread(d); + } + } + + /* We already chewed a zero, so empty is fine. */ + private Token number_octal() + throws IOException, + LexerException { + StringBuilder text = new StringBuilder("0"); + int d = read(); + long val = 0; + while (Character.digit(d, 8) != -1) { + val = (val << 3) + Character.digit(d, 8); + text.append((char)d); + d = read(); + } + number_suffix(text, d); + return new Token(INTEGER, + text.toString(), Long.valueOf(val)); + } + + /* We do not know whether know the first digit is valid. */ + private Token number_hex(char x) + throws IOException, + LexerException { + StringBuilder text = new StringBuilder("0"); + text.append(x); + int d = read(); + if (Character.digit(d, 16) == -1) { + unread(d); + error("Illegal hexadecimal constant " + (char)d); + return new Token(ERROR, text.toString(), null); + } + long val = 0; + do { + val = (val << 4) + Character.digit(d, 16); + text.append((char)d); + d = read(); + } while (Character.digit(d, 16) != -1); + number_suffix(text, d); + return new Token(INTEGER, + text.toString(), Long.valueOf(val)); + } + + /* We know we have at least one valid digit, but empty is not + * fine. */ + /* XXX This needs a complete rewrite. */ + private Token number_decimal(int c) + throws IOException, + LexerException { + StringBuilder text = new StringBuilder((char)c); + int d = c; + long val = 0; + do { + val = val * 10 + Character.digit(d, 10); + text.append((char)d); + d = read(); + } while (Character.digit(d, 10) != -1); + number_suffix(text, d); + return new Token(INTEGER, + text.toString(), Long.valueOf(val)); + } + + private Token identifier(int c) + throws IOException, + LexerException { + StringBuilder text = new StringBuilder(); + int d; + text.append((char)c); + for (;;) { + d = read(); + if (Character.isIdentifierIgnorable(d)) + ; + else if (Character.isJavaIdentifierPart(d)) + text.append((char)d); + else + break; + } + unread(d); + return new Token(IDENTIFIER, text.toString()); + } + + private Token whitespace(int c) + throws IOException, + LexerException { + StringBuilder text = new StringBuilder(); + int d; + text.append((char)c); + for (;;) { + d = read(); + if (ppvalid && isLineSeparator(d)) /* XXX Ugly. */ + break; + if (Character.isWhitespace(d)) + text.append((char)d); + else + break; + } + unread(d); + return new Token(WHITESPACE, text.toString()); + } + + /* No token processed by cond() contains a newline. */ + private Token cond(char c, int yes, int no) + throws IOException { + int d = read(); + if (c == d) + return new Token(yes); + unread(d); + return new Token(no); + } + + public Token token() + throws IOException, + LexerException { + Token tok = null; + + int _l = line; + int _c = column; + + int c = read(); + int d, e; + + switch (c) { + case '\n': + if (ppvalid) { + bol = true; + if (include) { + tok = new Token(NL, _l, _c, new String("\n")); + } + else { + int nls = 0; + do { + d = read(); + nls++; + } while (d == '\n'); + unread(d); + char[] text = new char[nls]; + for (int i = 0; i < text.length; i++) + text[i] = '\n'; + // Skip the bol = false below. + tok = new Token(NL, _l, _c, new String(text)); + } + if (DEBUG) + System.out.println("lx: Returning NL: " + tok); + return tok; + } + /* Let it be handled as whitespace. */ + break; + + case '!': + tok = cond('=', NE, '!'); + break; + + case '#': + if (bol) + tok = new Token(HASH); + else + tok = cond('#', PASTE, '#'); + break; + + case '+': + d = read(); + if (d == '+') + tok = new Token(INC); + else if (d == '=') + tok = new Token(PLUS_EQ); + else + unread(d); + break; + case '-': + d = read(); + if (d == '-') + tok = new Token(DEC); + else if (d == '=') + tok = new Token(SUB_EQ); + else if (d == '>') + tok = new Token(ARROW); + else + unread(d); + break; + + case '*': + tok = cond('=', MULT_EQ, '*'); + break; + case '/': + d = read(); + if (d == '*') + tok = ccomment(); + else if (d == '/') + tok = cppcomment(); + else if (d == '=') + tok = new Token(DIV_EQ); + else + unread(d); + break; + + case '%': + tok = cond('=', MOD_EQ, '%'); + break; + + case ':': + /* :: */ + break; + + case '<': + if (include) { + tok = string('<', '>'); + } + else { + d = read(); + if (d == '=') + tok = new Token(LE); + else if (d == '<') + tok = cond('=', LSH_EQ, LSH); + else + unread(d); + } + break; + + case '=': + tok = cond('=', EQ, '='); + break; + + case '>': + d = read(); + if (d == '=') + tok = new Token(GE); + else if (d == '>') + tok = cond('=', RSH_EQ, RSH); + else + unread(d); + break; + + case '^': + tok = cond('=', XOR_EQ, '^'); + break; + + case '|': + d = read(); + if (d == '=') + tok = new Token(OR_EQ); + else if (d == '|') + tok = cond('=', LOR_EQ, LOR); + else + unread(d); + break; + case '&': + d = read(); + if (d == '&') + tok = cond('=', LAND_EQ, LAND); + else if (d == '=') + tok = new Token(AND_EQ); + else + unread(d); + break; + + case '.': + d = read(); + if (d == '.') + tok = cond('.', ELLIPSIS, RANGE); + else + unread(d); + /* XXX decimal fraction */ + break; + + case '0': + /* octal or hex */ + d = read(); + if (d == 'x' || d == 'X') + tok = number_hex((char)d); + else { + unread(d); + tok = number_octal(); + } + break; + + case '\'': + tok = character(); + break; + + case '"': + tok = string('"', '"'); + break; + + case -1: + tok = new Token(EOF, _l, _c, "<eof>"); + break; + } + + if (tok == null) { + if (Character.isWhitespace(c)) { + tok = whitespace(c); + } + else if (Character.isDigit(c)) { + tok = number_decimal(c); + } + else if (Character.isJavaIdentifierStart(c)) { + tok = identifier(c); + } + else { + tok = new Token(c); + } + } + + bol = false; + + tok.setLocation(_l, _c); + if (DEBUG) + System.out.println("lx: Returning " + tok); + // (new Exception("here")).printStackTrace(System.out); + return tok; + } + +} |