/**************************************************************************** ** ** Copyright (C) 2017 The Qt Company Ltd. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of Qt Creator. ** ** Commercial License Usage ** Licensees holding valid commercial Qt licenses may use this file in ** accordance with the commercial license agreement provided with the ** Software or, alternatively, in accordance with the terms contained in ** a written agreement between you and The Qt Company. For licensing terms ** and conditions see https://www.qt.io/terms-conditions. For further ** information use the contact form at https://www.qt.io/contact-us. ** ** GNU General Public License Usage ** Alternatively, this file may be used under the terms of the GNU ** General Public License version 3 as published by the Free Software ** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT ** included in the packaging of this file. Please review the following ** information to ensure the GNU General Public License requirements will ** be met: https://www.gnu.org/licenses/gpl-3.0.html. ** ****************************************************************************/ #include #include #include using namespace Haskell::Internal; const QSet escapes{'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"', '\'', '&'}; struct TokenInfo { TokenType type; int column; QString text; }; Q_DECLARE_METATYPE(TokenInfo) bool operator==(const TokenInfo &info, const Token &token) { return info.type == token.type && info.column == token.startCol && info.text.length() == token.length && info.text == token.text.toString(); } bool operator==(const Token &token, const TokenInfo &info) { return info == token; } class tst_Tokenizer : public QObject { Q_OBJECT private slots: void singleLineComment_data(); void singleLineComment(); void multiLineComment_data(); void multiLineComment(); void string_data(); void string(); void character_data(); void character(); void number_data(); void number(); void keyword_data(); void keyword(); void variable_data(); void variable(); void constructor_data(); void constructor(); void op_data(); void op(); private: void setupData(); void addRow(const char *name, const QString &input, const QList &tokens, Tokens::State startState = Tokens::State::None, Tokens::State endState = Tokens::State::None); void checkData(); }; void tst_Tokenizer::setupData() { QTest::addColumn("input"); QTest::addColumn>("output"); QTest::addColumn("startState"); QTest::addColumn("endState"); } void tst_Tokenizer::addRow(const char *name, const QString &input, const QList &tokens, Tokens::State startState, Tokens::State endState) { QTest::newRow(name) << input << tokens << int(startState) << int(endState); } void tst_Tokenizer::checkData() { QFETCH(QString, input); QFETCH(QList, output); QFETCH(int, startState); QFETCH(int, endState); const Tokens tokens = HaskellTokenizer::tokenize(input, startState); QCOMPARE(tokens.length(), output.length()); QCOMPARE(tokens.state, endState); for (int i = 0; i < tokens.length(); ++i) { const Token t = tokens.at(i); const TokenInfo ti = output.at(i); QVERIFY2(t == ti, QString("Token at index %1 does not match, {%2, %3, \"%4\"} != {%5, %6, \"%7\"}") .arg(i) .arg(int(t.type)).arg(t.startCol).arg(t.text.toString()) .arg(int(ti.type)).arg(ti.column).arg(ti.text) .toUtf8().constData()); } } void tst_Tokenizer::singleLineComment_data() { setupData(); addRow("simple", " -- foo", { {TokenType::Whitespace, 0, " "}, {TokenType::SingleLineComment, 1, "-- foo"} }); addRow("dash, id", "--foo", { {TokenType::SingleLineComment, 0, "--foo"} }); addRow("dash, space, op", "-- |foo", { {TokenType::SingleLineComment, 0, "-- |foo"} }); addRow("multi-dash, space", "---- foo", { {TokenType::SingleLineComment, 0, "---- foo"} }); addRow("dash, op", "--| foo", { {TokenType::Operator, 0, "--|"}, {TokenType::Whitespace, 3, " "}, {TokenType::Variable, 4, "foo"} }); addRow("dash, special", "--(foo", { {TokenType::SingleLineComment, 0, "--(foo"} }); addRow("not a qualified varsym", "F.-- foo", { {TokenType::Constructor, 0, "F"}, {TokenType::Operator, 1, "."}, {TokenType::SingleLineComment, 2, "-- foo"} }); } void tst_Tokenizer::singleLineComment() { checkData(); } void tst_Tokenizer::multiLineComment_data() { setupData(); addRow("trailing dashes", "{---foo -}", { {TokenType::MultiLineComment, 0, "{---foo -}"} }); addRow("multiline", "{- foo", { {TokenType::MultiLineComment, 0, "{- foo"} }, Tokens::State::None, Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1)); addRow("multiline2", "bar -}", { {TokenType::MultiLineComment, 0, "bar -}"} }, Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1), Tokens::State::None); addRow("nested", "{- fo{-o", { {TokenType::MultiLineComment, 0, "{- fo{-o"} }, Tokens::State::None, Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 2)); addRow("nested2", "bar -}", { {TokenType::MultiLineComment, 0, "bar -}"} }, Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 2), Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1)); addRow("nested3", "bar -}", { {TokenType::MultiLineComment, 0, "bar -}"} }, Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1), Tokens::State::None); } void tst_Tokenizer::multiLineComment() { checkData(); } void tst_Tokenizer::string_data() { setupData(); addRow("simple", "\"foo\"", { {TokenType::String, 0, "\"foo\""} }); addRow("unterminated", "\"", { {TokenType::StringError, 0, "\""} }); addRow("unterminated2", "\"foo", { {TokenType::String, 0, "\"fo"}, {TokenType::StringError, 3, "o"} }); addRow("unterminated with escape", "\"\\\\", { {TokenType::String, 0, "\""}, {TokenType::EscapeSequence, 1, "\\"}, {TokenType::StringError, 2, "\\"} }); // gaps addRow("gap", "\" \\ \\\"", { {TokenType::String, 0, "\" \\ \\\""} }); addRow("gap over endline", "\"foo\\", { {TokenType::String, 0, "\"foo\\"} }, Tokens::State::None, Tokens::State::StringGap); addRow("gap over endline2", "\\foo\"", { {TokenType::String, 0, "\\foo\""} }, Tokens::State::StringGap, Tokens::State::None); addRow("gap error", "\"\\ ab \\\"", { {TokenType::String, 0, "\"\\ "}, {TokenType::StringError, 3, "ab"}, {TokenType::String, 5, " \\\""} }); addRow("gap error with quote", "\"\\ \"", { {TokenType::String, 0, "\"\\ "}, {TokenType::StringError, 3, "\""} }, Tokens::State::None, Tokens::State::StringGap); // char escapes (including wrong ones) for (char c = '!'; c <= '~'; ++c) { // skip uppercase and '^', since these can be part of ascii escapes // and 'o' and 'x' since they start octal and hex escapes // and digits as part of decimal escape if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '^' || c == 'o' || c == 'x') continue; const QChar qc(c); const QByteArray name = QString("charesc '%1'").arg(qc).toUtf8(); const QString input = QString("\"\\%1\"").arg(qc); if (escapes.contains(c)) { addRow(name.constData(), input, { {TokenType::String, 0, "\""}, {TokenType::EscapeSequence, 1, QLatin1String("\\") + qc}, {TokenType::String, 3, "\""} }); } else { addRow(name.constData(), input, { {TokenType::String, 0, "\"\\"}, {TokenType::StringError, 2, qc}, {TokenType::String, 3, "\""} }); } } addRow("decimal escape", "\"\\1234a\"", { {TokenType::String, 0, "\""}, {TokenType::EscapeSequence, 1, "\\1234"}, {TokenType::String, 6, "a\""} }); addRow("octal escape", "\"\\o0678a\"", { {TokenType::String, 0, "\""}, {TokenType::EscapeSequence, 1, "\\o067"}, {TokenType::String, 6, "8a\""} }); addRow("octal escape error", "\"\\o8a\"", { {TokenType::String, 0, "\"\\"}, {TokenType::StringError, 2, "o"}, {TokenType::String, 3, "8a\""} }); addRow("hexadecimal escape", "\"\\x0678Abg\"", { {TokenType::String, 0, "\""}, {TokenType::EscapeSequence, 1, "\\x0678Ab"}, {TokenType::String, 9, "g\""} }); addRow("hexadecimal escape error", "\"\\xg\"", { {TokenType::String, 0, "\"\\"}, {TokenType::StringError, 2, "x"}, {TokenType::String, 3, "g\""} }); // ascii cntrl escapes (including wrong ones) for (char c = '!'; c <= '~'; ++c) { if (c == '"') // is special because it also ends the string continue; const QChar qc(c); const QByteArray name = QString("ascii cntrl '^%1'").arg(qc).toUtf8(); const QString input = QString("\"\\^%1\"").arg(qc); if ((qc >= 'A' && qc <= 'Z') || qc == '@' || qc == '[' || qc == '\\' || qc == ']' || qc == '^' || qc == '_') { addRow(name.constData(), input, { {TokenType::String, 0, "\""}, {TokenType::EscapeSequence, 1, QLatin1String("\\^") + qc}, {TokenType::String, 4, "\""} }); } else { addRow(name.constData(), input, { {TokenType::String, 0, "\"\\"}, {TokenType::StringError, 2, "^"}, {TokenType::String, 3, QString(qc) + "\""} }); } } addRow("ascii escape SOH", "\"\\SOHN\"", { {TokenType::String, 0, "\""}, {TokenType::EscapeSequence, 1, "\\SOH"}, {TokenType::String, 5, "N\""} }); addRow("ascii escape SO", "\"\\SON\"", { {TokenType::String, 0, "\""}, {TokenType::EscapeSequence, 1, "\\SO"}, {TokenType::String, 4, "N\""} }); addRow("ascii escape error", "\"\\TON\"", { {TokenType::String, 0, "\"\\"}, {TokenType::StringError, 2, "T"}, {TokenType::String, 3, "ON\""} }); addRow("ascii escape error 2", "\"\\STO\"", { {TokenType::String, 0, "\"\\"}, {TokenType::StringError, 2, "S"}, {TokenType::String, 3, "TO\""} }); } void tst_Tokenizer::string() { checkData(); } void tst_Tokenizer::character_data() { setupData(); addRow("simple", "'a'", { {TokenType::Char, 0, "'a'"} }); addRow("too many", "'abc'", { {TokenType::Char, 0, "'a"}, {TokenType::CharError, 2, "bc"}, {TokenType::Char, 4, "'"} }); addRow("too few", "''", { {TokenType::Char, 0, "'"}, {TokenType::CharError, 1, "'"} }); addRow("only quote", "'", { {TokenType::CharError, 0, "'"} }); addRow("unterminated", "'a", { {TokenType::Char, 0, "'"}, {TokenType::CharError, 1, "a"} }); addRow("unterminated too many", "'abc", { {TokenType::Char, 0, "'a"}, {TokenType::CharError, 2, "bc"} }); addRow("unterminated backslash", "'\\", { {TokenType::Char, 0, "'"}, {TokenType::CharError, 1, "\\"} }); // char escapes (including wrong ones) for (char c = '!'; c <= '~'; ++c) { // skip uppercase and '^', since these can be part of ascii escapes // and 'o' and 'x' since they start octal and hex escapes // and digits as part of decimal escape if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '^' || c == 'o' || c == 'x') continue; const QChar qc(c); const QByteArray name = QString("charesc '%1'").arg(qc).toUtf8(); const QString input = QString("'\\%1'").arg(qc); if (c != '&' && escapes.contains(c)) { addRow(name.constData(), input, { {TokenType::Char, 0, "'"}, {TokenType::EscapeSequence, 1, QLatin1String("\\") + qc}, {TokenType::Char, 3, "'"} }); } else { addRow(name.constData(), input, { {TokenType::Char, 0, "'\\"}, {TokenType::CharError, 2, qc}, {TokenType::Char, 3, "'"} }); } } addRow("decimal escape", "'\\1234'", { {TokenType::Char, 0, "'"}, {TokenType::EscapeSequence, 1, "\\1234"}, {TokenType::Char, 6, "'"} }); addRow("decimal escape too long", "'\\1234a'", { {TokenType::Char, 0, "'"}, {TokenType::EscapeSequence, 1, "\\1234"}, {TokenType::CharError, 6, "a"}, {TokenType::Char, 7, "'"} }); addRow("octal escape", "'\\o067'", { {TokenType::Char, 0, "'"}, {TokenType::EscapeSequence, 1, "\\o067"}, {TokenType::Char, 6, "'"} }); addRow("octal escape error", "'\\o8'", { {TokenType::Char, 0, "'\\"}, {TokenType::CharError, 2, "o"}, {TokenType::CharError, 3, "8"}, {TokenType::Char, 4, "'"} }); addRow("hexadecimal escape", "'\\x0678Ab'", { {TokenType::Char, 0, "'"}, {TokenType::EscapeSequence, 1, "\\x0678Ab"}, {TokenType::Char, 9, "'"} }); addRow("hexadecimal escape error", "'\\xg'", { {TokenType::Char, 0, "'\\"}, {TokenType::CharError, 2, "x"}, {TokenType::CharError, 3, "g"}, {TokenType::Char, 4, "'"} }); // ascii cntrl escapes (including wrong ones) for (char c = '!'; c <= '~'; ++c) { if (c == '\'') // is special because it also ends the string continue; const QChar qc(c); const QByteArray name = QString("ascii cntrl '^%1'").arg(qc).toUtf8(); const QString input = QString("'\\^%1'").arg(qc); if ((qc >= 'A' && qc <= 'Z') || qc == '@' || qc == '[' || qc == '\\' || qc == ']' || qc == '^' || qc == '_') { addRow(name.constData(), input, { {TokenType::Char, 0, "'"}, {TokenType::EscapeSequence, 1, QLatin1String("\\^") + qc}, {TokenType::Char, 4, "'"} }); } else { addRow(name.constData(), input, { {TokenType::Char, 0, "'\\"}, {TokenType::CharError, 2, "^"}, {TokenType::CharError, 3, qc}, {TokenType::Char, 4, "'"} }); } } addRow("ascii escape SOH", "'\\SOH'", { {TokenType::Char, 0, "'"}, {TokenType::EscapeSequence, 1, "\\SOH"}, {TokenType::Char, 5, "'"} }); addRow("ascii escape SO, too long", "'\\SON'", { {TokenType::Char, 0, "'"}, {TokenType::EscapeSequence, 1, "\\SO"}, {TokenType::CharError, 4, "N"}, {TokenType::Char, 5, "'"} }); addRow("ascii escape error", "'\\TON'", { {TokenType::Char, 0, "'\\"}, {TokenType::CharError, 2, "T"}, {TokenType::CharError, 3, "ON"}, {TokenType::Char, 5, "'"} }); } void tst_Tokenizer::character() { checkData(); } void tst_Tokenizer::number_data() { setupData(); addRow("decimal", "012345", { {TokenType::Integer, 0, "012345"} }); addRow("single digit decimal", "0", { {TokenType::Integer, 0, "0"} }); addRow("octal", "0o1234", { {TokenType::Integer, 0, "0o1234"} }); // this is a bit weird, but correct: octal 1 followed by decimal 8 addRow("number after octal", "0O18", { {TokenType::Integer, 0, "0O1"}, {TokenType::Integer, 3, "8"} }); addRow("not octal", "0o9", { {TokenType::Integer, 0, "0"}, {TokenType::Variable, 1, "o9"}, }); addRow("hexadecimal", "0x9fA", { {TokenType::Integer, 0, "0x9fA"} }); // hex number followed by identifier 'g' addRow("hexadecimal", "0X9fg", { {TokenType::Integer, 0, "0X9f"}, {TokenType::Variable, 4, "g"} }); // 0 followed by identifier addRow("decimal followed by identifier", "0z6", { {TokenType::Integer, 0, "0"}, {TokenType::Variable, 1, "z6"} }); addRow("float", "0123.45", { {TokenType::Float, 0, "0123.45"} }); addRow("decimal + operator '.'", "0123.", { {TokenType::Integer, 0, "0123"}, {TokenType::Operator, 4, "."} }); addRow("operator '.' + decimal", ".0123", { {TokenType::Operator, 0, "."}, {TokenType::Integer, 1, "0123"} }); addRow("without '.', with exp 'e'", "0123e45", { {TokenType::Float, 0, "0123e45"} }); addRow("without '.', with exp 'E'", "0123E45", { {TokenType::Float, 0, "0123E45"} }); addRow("without '.', with '+'", "0123e+45", { {TokenType::Float, 0, "0123e+45"} }); addRow("without '.', with '-'", "0123e-45", { {TokenType::Float, 0, "0123e-45"} }); addRow("without '.', with '+', missing decimal", "0123e+", { {TokenType::Integer, 0, "0123"}, {TokenType::Variable, 4, "e"}, {TokenType::Operator, 5, "+"} }); addRow("without '.', missing decimal", "0123e", { {TokenType::Integer, 0, "0123"}, {TokenType::Variable, 4, "e"} }); addRow("exp 'e'", "01.23e45", { {TokenType::Float, 0, "01.23e45"} }); addRow("exp 'E'", "01.23E45", { {TokenType::Float, 0, "01.23E45"} }); addRow("with '+'", "01.23e+45", { {TokenType::Float, 0, "01.23e+45"} }); addRow("with '-'", "01.23e-45", { {TokenType::Float, 0, "01.23e-45"} }); addRow("with '+', missing decimal", "01.23e+", { {TokenType::Float, 0, "01.23"}, {TokenType::Variable, 5, "e"}, {TokenType::Operator, 6, "+"} }); addRow("missing decimal", "01.23e", { {TokenType::Float, 0, "01.23"}, {TokenType::Variable, 5, "e"} }); } void tst_Tokenizer::number() { checkData(); } void tst_Tokenizer::keyword_data() { setupData(); addRow("data", "data", { {TokenType::Keyword, 0, "data"} }); addRow("not a qualified varid", "Foo.case", { {TokenType::Constructor, 0, "Foo"}, {TokenType::Operator, 3, "."}, {TokenType::Keyword, 4, "case"} }); addRow(":", ":", { {TokenType::Keyword, 0, ":"} }); addRow("->", "->", { {TokenType::Keyword, 0, "->"} }); addRow("not a qualified varsym", "Foo...", { {TokenType::Constructor, 0, "Foo"}, {TokenType::Operator, 3, "..."} }); } void tst_Tokenizer::keyword() { checkData(); } void tst_Tokenizer::variable_data() { setupData(); addRow("simple", "fOo_1'", { {TokenType::Variable, 0, "fOo_1'"} }); addRow("start with '_'", "_1", { {TokenType::Variable, 0, "_1"} }); addRow("not a keyword", "cases", { {TokenType::Variable, 0, "cases"} }); addRow("not a keyword 2", "qualified", { {TokenType::Variable, 0, "qualified"} }); addRow("not a keyword 3", "as", { {TokenType::Variable, 0, "as"} }); addRow("not a keyword 4", "hiding", { {TokenType::Variable, 0, "hiding"} }); addRow(".variable", ".foo", { {TokenType::Operator, 0, "."}, {TokenType::Variable, 1, "foo"} }); addRow("variable.", "foo.", { {TokenType::Variable, 0, "foo"}, {TokenType::Operator, 3, "."} }); addRow("variable.variable", "blah.foo", { {TokenType::Variable, 0, "blah"}, {TokenType::Operator, 4, "."}, {TokenType::Variable, 5, "foo"} }); addRow("qualified", "Blah.foo", { {TokenType::Variable, 0, "Blah.foo"} }); addRow("qualified2", "Goo.Blah.foo", { {TokenType::Variable, 0, "Goo.Blah.foo"} }); addRow("variable + op '..'", "foo..", { {TokenType::Variable, 0, "foo"}, {TokenType::Keyword, 3, ".."} }); addRow("variable + op '...'", "foo...", { {TokenType::Variable, 0, "foo"}, {TokenType::Operator, 3, "..."} }); } void tst_Tokenizer::variable() { checkData(); } void tst_Tokenizer::constructor_data() { setupData(); addRow("simple", "Foo", { {TokenType::Constructor, 0, "Foo"} }); addRow("qualified", "Foo.Bar", { {TokenType::Constructor, 0, "Foo.Bar"} }); addRow("followed by op '.'", "Foo.Bar.", { {TokenType::Constructor, 0, "Foo.Bar"}, {TokenType::Operator, 7, "."} }); } void tst_Tokenizer::constructor() { checkData(); } void tst_Tokenizer::op_data() { setupData(); addRow("simple", "+-=", { {TokenType::Operator, 0, "+-="} }); addRow("qualified", "Foo.+-=", { {TokenType::Operator, 0, "Foo.+-="} }); addRow("qualified '.'", "Foo..", { {TokenType::Operator, 0, "Foo.."} }); addRow("constructor plus op", "Foo+", { {TokenType::Constructor, 0, "Foo"}, {TokenType::Operator, 3, "+"} }); } void tst_Tokenizer::op() { checkData(); } QTEST_MAIN(tst_Tokenizer) #include "tst_tokenizer.moc"