diff options
Diffstat (limited to 'src/main/java/org/anarres/cpp/LexerSource.java')
-rw-r--r-- | src/main/java/org/anarres/cpp/LexerSource.java | 910 |
1 files changed, 910 insertions, 0 deletions
// diff --git a/src/main/java/org/anarres/cpp/LexerSource.java b/src/main/java/org/anarres/cpp/LexerSource.java new file mode 100644 index 0000000..ca18314 --- /dev/null +++ b/src/main/java/org/anarres/cpp/LexerSource.java @@ -0,0 +1,910 @@
/*
 * Anarres C Preprocessor
 * Copyright (c) 2007-2008, Shevek
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.anarres.cpp;

import java.io.IOException;
import java.io.Reader;

import javax.annotation.Nonnull;
import static org.anarres.cpp.Token.*;

/**
 * A {@link Source} which lexes the characters of a {@link Reader} into
 * {@link Token Tokens}.
 *
 * The digraphs {@code <:  :>  <%  %>  %:  %:%:} are recognised when
 * {@link Feature#DIGRAPHS} is enabled on the preprocessor passed to
 * {@link #init(Preprocessor)}; until then they are enabled by default.
 */
public class LexerSource extends Source {

    private static final boolean DEBUG = false;

    private JoinReader reader;
    private final boolean ppvalid;
    /* True while only whitespace/comments have been seen on this line. */
    private boolean bol;
    /* True while lexing the argument of an include-style directive. */
    private boolean include;

    private boolean digraphs;

    /* Unread: a two-slot pushback stack; u1 is popped before u0. */
    private int u0, u1;
    private int ucount;

    private int line;
    private int column;
    /* Column count of the previous line, restored when a newline is unread. */
    private int lastcolumn;
    /* True if the last character read was a CR, so a following LF is
     * not counted as a second line break. */
    private boolean cr;

    /**
     * Creates a lexer over the given reader.
     *
     * ppvalid is:
     * false in StringLexerSource,
     * true in FileLexerSource
     *
     * @param r the character stream to lex.
     * @param ppvalid whether newlines are returned as NL tokens (true)
     *        or folded into whitespace (false).
     */
    public LexerSource(Reader r, boolean ppvalid) {
        this.reader = new JoinReader(r);
        this.ppvalid = ppvalid;
        this.bol = true;
        this.include = false;

        this.digraphs = true;

        this.ucount = 0;

        this.line = 1;
        this.column = 0;
        this.lastcolumn = -1;
        this.cr = false;
    }

    /**
     * Binds this source to a preprocessor, picks up its digraph feature
     * setting and initialises the underlying reader.
     */
    @Override
    /* pp */ void init(Preprocessor pp) {
        super.init(pp);
        this.digraphs = pp.getFeature(Feature.DIGRAPHS);
        this.reader.init(pp, this);
    }

    /** Returns the current line number; the first line is 1. */
    @Override
    public int getLine() {
        return line;
    }

    /** Returns the number of characters consumed on the current line. */
    @Override
    public int getColumn() {
        return column;
    }

    @Override
    /* pp */ boolean isNumbered() {
        return true;
    }

    /**
     * Reports an error or a warning at the position of the most recently
     * read character: one column back, or the end of the previous line
     * if nothing has been read on the current line yet.
     */
    private void _error(String msg, boolean error)
            throws LexerException {
        int _l = line;
        int _c = column;
        if (_c == 0) {
            _c = lastcolumn;
            _l--;
        } else {
            _c--;
        }
        if (error)
            super.error(_l, _c, msg);
        else
            super.warning(_l, _c, msg);
    }

    /* Allow JoinReader to call this. */
    /* pp */ final void error(String msg)
            throws LexerException {
        _error(msg, true);
    }

    /* Allow JoinReader to call this. */
    /* pp */ final void warning(String msg)
            throws LexerException {
        _error(msg, false);
    }

    /**
     * A flag for string handling: while set, {@code <...>} is lexed as a
     * HEADER token and backslashes in strings are not treated as escapes.
     */
    /* pp */ void setInclude(boolean b) {
        this.include = b;
    }

    /**
     * Returns true for any character treated as a line break, and for
     * end of input (-1).
     *
     * XXX Move to JoinReader and canonicalise newlines.
     */
    private static boolean isLineSeparator(int c) {
        switch ((char) c) {
            case '\r':
            case '\n':
            case '\u2028':
            case '\u2029':
            case '\u000B':
            case '\u000C':
            case '\u0085':
                return true;
            default:
                return (c == -1);
        }
    }

    /**
     * Reads one character, first from the pushback slots and then from
     * the reader, and updates the line/column bookkeeping.
     *
     * @return the character, or -1 at end of input or after close().
     */
    private int read()
            throws IOException,
            LexerException {
        int c;
        assert ucount <= 2 : "Illegal ucount: " + ucount;
        switch (ucount) {
            case 2:
                ucount = 1;
                c = u1;
                break;
            case 1:
                ucount = 0;
                c = u0;
                break;
            default:
                if (reader == null)
                    c = -1;
                else
                    c = reader.read();
                break;
        }

        switch (c) {
            case '\r':
                cr = true;
                line++;
                lastcolumn = column;
                column = 0;
                break;
            case '\n':
                if (cr) {
                    /* The LF of a CRLF pair: the CR already counted the line. */
                    cr = false;
                    break;
                }
            /* fallthrough */
            case '\u2028':
            case '\u2029':
            case '\u000B':
            case '\u000C':
            case '\u0085':
                cr = false;
                line++;
                lastcolumn = column;
                column = 0;
                break;
            case -1:
                cr = false;
                break;
            default:
                cr = false;
                column++;
                break;
        }

        return c;
    }

    /**
     * Pushes a character back so that the next read() returns it.
     * Unreading -1 is a no-op.
     *
     * You can unget AT MOST one newline, since only one previous column
     * count is remembered.
     *
     * @throws IllegalStateException if two characters are already pending.
     */
    private void unread(int c)
            throws IOException {
        /* XXX Must unread newlines. */
        if (c != -1) {
            if (isLineSeparator(c)) {
                line--;
                column = lastcolumn;
                cr = false;
            } else {
                column--;
            }
            switch (ucount) {
                case 0:
                    u0 = c;
                    ucount = 1;
                    break;
                case 1:
                    u1 = c;
                    ucount = 2;
                    break;
                default:
                    throw new IllegalStateException(
                            "Cannot unget another character!"
                    );
            }
        }
    }

    /**
     * Consumes the rest of the current line into an INVALID token.
     * The terminating line separator is left unread.
     */
    @Nonnull
    private Token invalid(StringBuilder text, String reason)
            throws IOException,
            LexerException {
        int d = read();
        while (!isLineSeparator(d)) {
            text.append((char) d);
            d = read();
        }
        unread(d);
        return new Token(INVALID, text.toString(), reason);
    }

    /**
     * Lexes a C-style comment whose opening delimiter has already been
     * consumed.
     *
     * @return a CCOMMENT token, or an INVALID token if end of input is
     *         reached before the closing delimiter.
     */
    @Nonnull
    private Token ccomment()
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder("/*");
        int d;
        do {
            /* Skip to the next star. */
            do {
                d = read();
                /* read() returns -1 forever at EOF; without this check
                 * the loop would never terminate. */
                if (d == -1)
                    return new Token(INVALID, text.toString(),
                            "Unterminated comment");
                text.append((char) d);
            } while (d != '*');
            /* Skip a run of stars; the comment ends if a slash follows. */
            do {
                d = read();
                if (d == -1)
                    return new Token(INVALID, text.toString(),
                            "Unterminated comment");
                text.append((char) d);
            } while (d == '*');
        } while (d != '/');
        return new Token(CCOMMENT, text.toString());
    }

    /**
     * Lexes a C++-style comment whose leading slashes have already been
     * consumed. The line separator is not part of the token.
     */
    @Nonnull
    private Token cppcomment()
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder("//");
        int d = read();
        while (!isLineSeparator(d)) {
            text.append((char) d);
            d = read();
        }
        unread(d);
        return new Token(CPPCOMMENT, text.toString());
    }

    /**
     * Lexes the remainder of an escape sequence; the backslash has
     * already been consumed and appended by the caller.
     *
     * @param text receives the source characters of the escape.
     * @return the value of the escaped character.
     */
    private int escape(StringBuilder text)
            throws IOException,
            LexerException {
        int d = read();
        switch (d) {
            case 'a':
                text.append('a');
                return 0x07;
            case 'b':
                text.append('b');
                return '\b';
            case 'f':
                text.append('f');
                return '\f';
            case 'n':
                text.append('n');
                return '\n';
            case 'r':
                text.append('r');
                return '\r';
            case 't':
                text.append('t');
                return '\t';
            case 'v':
                text.append('v');
                return 0x0b;
            case '\\':
                text.append('\\');
                return '\\';

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7': {
                /* Up to three octal digits. */
                int len = 0;
                int val = 0;
                do {
                    val = (val << 3) + Character.digit(d, 8);
                    text.append((char) d);
                    d = read();
                } while (++len < 3 && Character.digit(d, 8) != -1);
                unread(d);
                return val;
            }

            case 'x': {
                /* Up to two hex digits. */
                text.append((char) d);
                int len = 0;
                int val = 0;
                while (len++ < 2) {
                    d = read();
                    if (Character.digit(d, 16) == -1) {
                        unread(d);
                        break;
                    }
                    val = (val << 4) + Character.digit(d, 16);
                    text.append((char) d);
                }
                return val;
            }

            /* Exclude two cases from the warning. */
            case '"':
                text.append('"');
                return '"';
            case '\'':
                text.append('\'');
                return '\'';

            default:
                warning("Unnecessary escape character " + (char) d);
                text.append((char) d);
                return d;
        }
    }

    /**
     * Lexes a single-character literal whose opening quote has already
     * been consumed.
     *
     * Not called from token() in this class, which lexes single-quoted
     * literals via string() instead.
     *
     * @return a CHARACTER token, or an INVALID token for an empty,
     *         unterminated, over-long or undefined-character literal.
     */
    @Nonnull
    private Token character()
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder("'");
        int d = read();
        if (d == '\\') {
            text.append('\\');
            d = escape(text);
        } else if (isLineSeparator(d)) {
            unread(d);
            return new Token(INVALID, text.toString(),
                    "Unterminated character literal");
        } else if (d == '\'') {
            text.append('\'');
            return new Token(INVALID, text.toString(),
                    "Empty character literal");
        } else if (!Character.isDefined(d)) {
            text.append('?');
            return invalid(text, "Illegal unicode character literal");
        } else {
            text.append((char) d);
        }

        int e = read();
        if (e != '\'') {
            /* We consume up to the next ' or the rest of the line. */
            for (;;) {
                if (isLineSeparator(e)) {
                    unread(e);
                    break;
                }
                text.append((char) e);
                if (e == '\'')
                    break;
                e = read();
            }
            return new Token(INVALID, text.toString(),
                    "Illegal character constant " + text);
        }
        text.append('\'');
        /* XXX Is this a bad cast? */
        return new Token(CHARACTER,
                text.toString(), Character.valueOf((char) d));
    }

    /**
     * Lexes a delimited literal whose opening delimiter has already been
     * consumed.
     *
     * The token text holds the source spelling and the token value the
     * decoded contents. In include mode a backslash is copied to the
     * text only: it is neither decoded as an escape nor added to the
     * value.
     *
     * @param open the opening delimiter, used only for the token text.
     * @param close the closing delimiter; selects the token type.
     * @return a STRING, HEADER, CHARACTER or SQSTRING token, or an
     *         INVALID token if the line or the input ends first.
     * @throws IllegalStateException if close is not one of the
     *         supported delimiters.
     */
    @Nonnull
    private Token string(char open, char close)
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder();
        text.append(open);

        StringBuilder buf = new StringBuilder();

        for (;;) {
            int c = read();
            if (c == close) {
                break;
            } else if (c == '\\') {
                text.append('\\');
                if (!include) {
                    char d = (char) escape(text);
                    buf.append(d);
                }
            } else if (c == -1) {
                unread(c);
                return new Token(INVALID, text.toString(),
                        "End of file in string literal after " + buf);
            } else if (isLineSeparator(c)) {
                unread(c);
                return new Token(INVALID, text.toString(),
                        "Unterminated string literal after " + buf);
            } else {
                text.append((char) c);
                buf.append((char) c);
            }
        }
        text.append(close);
        switch (close) {
            case '"':
                return new Token(STRING,
                        text.toString(), buf.toString());
            case '>':
                return new Token(HEADER,
                        text.toString(), buf.toString());
            case '\'':
                if (buf.length() == 1)
                    return new Token(CHARACTER,
                            text.toString(), buf.toString());
                return new Token(SQSTRING,
                        text.toString(), buf.toString());
            default:
                throw new IllegalStateException(
                        "Unknown closing character " + String.valueOf(close));
        }
    }

    /**
     * Lexes the suffix letters of a numeric constant and builds the
     * resulting token.
     *
     * @param text the source text of the number so far; suffix letters
     *        are appended to it.
     * @param value the numeric value, which receives the suffix flags.
     * @param d the first character after the digits, already read.
     * @return a NUMBER token, or an INVALID token if the number is
     *         followed by a letter or underscore which is not a suffix.
     */
    @Nonnull
    private Token _number_suffix(StringBuilder text, NumericValue value, int d)
            throws IOException,
            LexerException {
        int flags = 0;	// U, I, L, LL, F, D, MSB
        for (;;) {
            if (d == 'U' || d == 'u') {
                if ((flags & NumericValue.F_UNSIGNED) != 0)
                    warning("Duplicate unsigned suffix " + (char) d);
                flags |= NumericValue.F_UNSIGNED;
                text.append((char) d);
                d = read();
            } else if (d == 'L' || d == 'l') {
                if ((flags & NumericValue.FF_SIZE) != 0)
                    warning("Multiple length suffixes after " + text);
                text.append((char) d);
                int e = read();
                if (e == d) {	// Case must match. Ll is Welsh.
                    flags |= NumericValue.F_LONGLONG;
                    text.append((char) e);
                    d = read();
                } else {
                    flags |= NumericValue.F_LONG;
                    d = e;
                }
            } else if (d == 'I' || d == 'i') {
                if ((flags & NumericValue.FF_SIZE) != 0)
                    warning("Multiple length suffixes after " + text);
                flags |= NumericValue.F_INT;
                text.append((char) d);
                d = read();
            } else if (d == 'F' || d == 'f') {
                if ((flags & NumericValue.FF_SIZE) != 0)
                    warning("Multiple length suffixes after " + text);
                flags |= NumericValue.F_FLOAT;
                text.append((char) d);
                d = read();
            } else if (d == 'D' || d == 'd') {
                if ((flags & NumericValue.FF_SIZE) != 0)
                    warning("Multiple length suffixes after " + text);
                flags |= NumericValue.F_DOUBLE;
                text.append((char) d);
                d = read();
            } // This should probably be isPunct() || isWhite().
            else if (Character.isLetter(d) || d == '_') {
                unread(d);
                value.setFlags(flags);
                return invalid(text,
                        "Invalid suffix \"" + (char) d
                        + "\" on numeric constant");
            } else {
                unread(d);
                value.setFlags(flags);
                return new Token(NUMBER,
                        text.toString(), value);
            }
        }
    }

    /**
     * Lexes a run of digits: an integer part, a fractional part, or an
     * exponent.
     *
     * @param text receives the source characters consumed.
     * @param base the radix of the digits.
     * @param sign whether a single leading '+' or '-' is accepted, as in
     *        an exponent. A sign not followed by a digit is pushed back.
     * @return the digits, prefixed with '-' if a minus sign was
     *         consumed; a '+' is kept in the text but not in the result.
     */
    @Nonnull
    private String _number_part(StringBuilder text, int base, boolean sign)
            throws IOException,
            LexerException {
        StringBuilder part = new StringBuilder();
        int d = read();
        if (sign && (d == '+' || d == '-')) {
            int s = d;
            d = read();
            if (Character.digit(d, base) == -1) {
                /* Not an exponent sign after all: push both back so
                 * that the sign is read again first. */
                unread(d);
                unread(s);
                return part.toString();
            }
            text.append((char) s);
            /* NOTE(review): assumes NumericValue.setExponent accepts a
             * leading '-' - confirm against NumericValue. */
            if (s == '-')
                part.append('-');
        }
        while (Character.digit(d, base) != -1) {
            text.append((char) d);
            part.append((char) d);
            d = read();
        }
        unread(d);
        return part.toString();
    }

    /* We already chewed a zero, so empty is fine. */
    @Nonnull
    private Token number_octal()
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder("0");
        String integer = _number_part(text, 8, false);
        int d = read();
        NumericValue value = new NumericValue(8, integer);
        return _number_suffix(text, value, d);
    }

    /**
     * Lexes a hexadecimal constant after the leading zero and the given
     * x character have been consumed.
     *
     * We do not know whether the first digit is valid.
     */
    @Nonnull
    private Token number_hex(char x)
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder("0");
        text.append(x);
        String integer = _number_part(text, 16, false);
        NumericValue value = new NumericValue(16, integer);
        int d = read();
        if (d == '.') {
            /* Keep the point in the token text. */
            text.append((char) d);
            String fraction = _number_part(text, 16, false);
            value.setFractionalPart(fraction);
            d = read();
        }
        if (d == 'P' || d == 'p') {
            /* Keep the exponent marker in the token text. */
            text.append((char) d);
            String exponent = _number_part(text, 10, true);
            value.setExponent(exponent);
            d = read();
        }
        // XXX Make sure it's got enough parts
        return _number_suffix(text, value, d);
    }

    /**
     * Lexes a decimal constant, none of which has been consumed yet.
     *
     * The integer part may be empty when the constant starts with a
     * decimal point.
     */
    @Nonnull
    private Token number_decimal()
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder();
        String integer = _number_part(text, 10, false);
        NumericValue value = new NumericValue(10, integer);
        int d = read();
        if (d == '.') {
            /* Keep the point in the token text. */
            text.append((char) d);
            String fraction = _number_part(text, 10, false);
            value.setFractionalPart(fraction);
            d = read();
        }
        if (d == 'E' || d == 'e') {
            /* Keep the exponent marker in the token text. */
            text.append((char) d);
            String exponent = _number_part(text, 10, true);
            value.setExponent(exponent);
            d = read();
        }
        // XXX Make sure it's got enough parts
        return _number_suffix(text, value, d);
    }

    /**
     * Lexes an identifier whose first character is c. Characters for
     * which Character.isIdentifierIgnorable is true are consumed but
     * left out of the token text.
     */
    @Nonnull
    private Token identifier(int c)
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder();
        int d;
        text.append((char) c);
        for (;;) {
            d = read();
            if (Character.isIdentifierIgnorable(d))
                continue;
            if (!Character.isJavaIdentifierPart(d))
                break;
            text.append((char) d);
        }
        unread(d);
        return new Token(IDENTIFIER, text.toString());
    }

    /**
     * Lexes a run of whitespace whose first character is c. When
     * ppvalid, the run stops before a line separator.
     */
    @Nonnull
    private Token whitespace(int c)
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder();
        int d;
        text.append((char) c);
        for (;;) {
            d = read();
            if (ppvalid && isLineSeparator(d)) /* XXX Ugly. */
                break;
            if (Character.isWhitespace(d))
                text.append((char) d);
            else
                break;
        }
        unread(d);
        return new Token(WHITESPACE, text.toString());
    }

    /**
     * Returns a token of type yes if the next character is c, consuming
     * it; otherwise a token of type no.
     *
     * No token processed by cond() contains a newline.
     */
    @Nonnull
    private Token cond(char c, int yes, int no)
            throws IOException,
            LexerException {
        int d = read();
        if (c == d)
            return new Token(yes);
        unread(d);
        return new Token(no);
    }

    /**
     * Lexes and returns the next token, with its location set to the
     * line and column at which it started.
     *
     * At end of input this source is closed and an EOF token is
     * returned.
     */
    @Override
    public Token token()
            throws IOException,
            LexerException {
        Token tok = null;

        int _l = line;
        int _c = column;

        int c = read();
        int d;

        switch (c) {
            case '\n':
                if (ppvalid) {
                    bol = true;
                    if (include) {
                        tok = new Token(NL, _l, _c, "\n");
                    } else {
                        /* Fold a run of newlines into a single NL token. */
                        StringBuilder nls = new StringBuilder();
                        do {
                            nls.append('\n');
                            d = read();
                        } while (d == '\n');
                        unread(d);
                        tok = new Token(NL, _l, _c, nls.toString());
                    }
                    if (DEBUG)
                        System.out.println("lx: Returning NL: " + tok);
                    // Returning here skips the bol = false below.
                    return tok;
                }
                /* Let it be handled as whitespace. */
                break;

            case '!':
                tok = cond('=', NE, '!');
                break;

            case '#':
                if (bol)
                    tok = new Token(HASH);
                else
                    tok = cond('#', PASTE, '#');
                break;

            case '+':
                d = read();
                if (d == '+')
                    tok = new Token(INC);
                else if (d == '=')
                    tok = new Token(PLUS_EQ);
                else
                    unread(d);
                break;
            case '-':
                d = read();
                if (d == '-')
                    tok = new Token(DEC);
                else if (d == '=')
                    tok = new Token(SUB_EQ);
                else if (d == '>')
                    tok = new Token(ARROW);
                else
                    unread(d);
                break;

            case '*':
                tok = cond('=', MULT_EQ, '*');
                break;
            case '/':
                d = read();
                if (d == '*')
                    tok = ccomment();
                else if (d == '/')
                    tok = cppcomment();
                else if (d == '=')
                    tok = new Token(DIV_EQ);
                else
                    unread(d);
                break;

            case '%':
                d = read();
                if (d == '=') {
                    tok = new Token(MOD_EQ);
                } else if (digraphs && d == '>') {
                    tok = new Token('}');	// digraph
                } else if (digraphs && d == ':') {
                    /* Either the one-hash digraph, or the first half of
                     * the paste digraph. */
                    d = read();
                    if (d != '%') {
                        unread(d);
                        tok = new Token('#');	// digraph
                    } else {
                        d = read();
                        if (d == ':') {
                            tok = new Token(PASTE);	// digraph
                        } else {
                            unread(d);	// Unread 2 chars here.
                            unread('%');
                            tok = new Token('#');	// digraph
                        }
                    }
                } else {
                    unread(d);
                }
                break;

            case ':':
                /* :: */
                d = read();
                if (digraphs && d == '>')
                    tok = new Token(']');	// digraph
                else
                    unread(d);
                break;

            case '<':
                if (include) {
                    tok = string('<', '>');
                } else {
                    d = read();
                    if (d == '=')
                        tok = new Token(LE);
                    else if (d == '<')
                        tok = cond('=', LSH_EQ, LSH);
                    else if (digraphs && d == ':')
                        tok = new Token('[');	// digraph
                    else if (digraphs && d == '%')
                        tok = new Token('{');	// digraph
                    else
                        unread(d);
                }
                break;

            case '=':
                tok = cond('=', EQ, '=');
                break;

            case '>':
                d = read();
                if (d == '=')
                    tok = new Token(GE);
                else if (d == '>')
                    tok = cond('=', RSH_EQ, RSH);
                else
                    unread(d);
                break;

            case '^':
                tok = cond('=', XOR_EQ, '^');
                break;

            case '|':
                d = read();
                if (d == '=')
                    tok = new Token(OR_EQ);
                else if (d == '|')
                    tok = cond('=', LOR_EQ, LOR);
                else
                    unread(d);
                break;
            case '&':
                d = read();
                if (d == '&')
                    tok = cond('=', LAND_EQ, LAND);
                else if (d == '=')
                    tok = new Token(AND_EQ);
                else
                    unread(d);
                break;

            case '.':
                d = read();
                if (d == '.')
                    tok = cond('.', ELLIPSIS, RANGE);
                else
                    unread(d);
                if (Character.isDigit(d)) {
                    /* A constant with an empty integer part. */
                    unread('.');
                    tok = number_decimal();
                }
                break;

            case '0':
                /* octal, hex, or a decimal fraction */
                d = read();
                if (d == 'x' || d == 'X') {
                    tok = number_hex((char) d);
                } else if (d == '.') {
                    /* A zero followed by a point is a decimal constant,
                     * not an octal zero followed by a fraction. */
                    unread(d);
                    unread(c);
                    tok = number_decimal();
                } else {
                    unread(d);
                    tok = number_octal();
                }
                break;

            case '\'':
                tok = string('\'', '\'');
                break;

            case '"':
                tok = string('"', '"');
                break;

            case -1:
                close();
                tok = new Token(EOF, _l, _c, "<eof>");
                break;
        }

        if (tok == null) {
            if (Character.isWhitespace(c)) {
                tok = whitespace(c);
            } else if (Character.isDigit(c)) {
                unread(c);
                tok = number_decimal();
            } else if (Character.isJavaIdentifierStart(c)) {
                tok = identifier(c);
            } else {
                tok = new Token(c);
            }
        }

        if (bol) {
            /* Only whitespace and C comments leave us at beginning of line. */
            switch (tok.getType()) {
                case WHITESPACE:
                case CCOMMENT:
                    break;
                default:
                    bol = false;
                    break;
            }
        }

        tok.setLocation(_l, _c);
        if (DEBUG)
            System.out.println("lx: Returning " + tok);
        return tok;
    }

    /**
     * Closes the underlying reader, if still open, and then the
     * superclass. Subsequent reads return end of input.
     */
    @Override
    public void close()
            throws IOException {
        if (reader != null) {
            reader.close();
            reader = null;
        }
        super.close();
    }

}