/** * @file tokenize.cpp * This file breaks up the text stream into tokens or chunks. * * Each routine needs to set pc.len and pc.type. * * @author Ben Gardner * @license GPL v2+ */ #include "tokenize.h" #include "keywords.h" #include "prototypes.h" #include "punctuators.h" #include "unc_ctype.h" #include #include #define LE_COUNT(x) cpd.le_counts[static_cast(LE_ ## x)] constexpr static auto LCURRENT = LTOK; using namespace std; using namespace uncrustify; struct tok_info { tok_info() : last_ch(0) , idx(0) , row(1) , col(1) { } size_t last_ch; size_t idx; size_t row; size_t col; }; struct tok_ctx { tok_ctx(const deque &d) : data(d) { } //! save before trying to parse something that may fail void save() { save(s); } void save(tok_info &info) { info = c; } //! restore previous saved state void restore() { restore(s); } void restore(const tok_info &info) { c = info; } bool more() { return(c.idx < data.size()); } size_t peek() { return(more() ? data[c.idx] : 0); } size_t peek(size_t idx) { idx += c.idx; return((idx < data.size()) ? data[idx] : 0); } size_t get() { if (more()) { size_t ch = data[c.idx++]; switch (ch) { case '\t': log_rule_B("input_tab_size"); c.col = calc_next_tab_column(c.col, options::input_tab_size()); break; case '\n': if (c.last_ch != '\r') { c.row++; c.col = 1; } break; case '\r': c.row++; c.col = 1; break; default: c.col++; break; } c.last_ch = ch; return(ch); } return(0); } bool expect(size_t ch) { if (peek() == ch) { get(); return(true); } return(false); } const deque &data; tok_info c; //! current tok_info s; //! saved }; /** * Count the number of characters in a quoted string. * The next bit of text starts with a quote char " or ' or <. * Count the number of characters until the matching character. * * @param pc The structure to update, str is an input. * * @return Whether a string was parsed */ static bool parse_string(tok_ctx &ctx, Chunk &pc, size_t quote_idx, bool allow_escape); /** * Literal string, ends with single " * Two "" don't end the string. * * @param pc The structure to update, str is an input. * * @return Whether a string was parsed */ static bool parse_cs_string(tok_ctx &ctx, Chunk &pc); /** * VALA verbatim string, ends with three quotes (""") * * @param pc The structure to update, str is an input. */ static void parse_verbatim_string(tok_ctx &ctx, Chunk &pc); static bool tag_compare(const deque &d, size_t a_idx, size_t b_idx, size_t len); /** * Parses a C++0x 'R' string. R"( xxx )" R"tag( )tag" u8R"(x)" uR"(x)" * Newlines may be in the string. * * @param pc structure to update, str is an input. */ static bool parse_cr_string(tok_ctx &ctx, Chunk &pc, size_t q_idx); /** * Count the number of whitespace characters. * * @param pc The structure to update, str is an input. * * @return Whether whitespace was parsed */ static bool parse_whitespace(tok_ctx &ctx, Chunk &pc); /** * Called when we hit a backslash. * If there is nothing but whitespace until the newline, then this is a * backslash newline * * @param pc structure to update, str is an input */ static bool parse_bs_newline(tok_ctx &ctx, Chunk &pc); /** * Parses any number of tab or space chars followed by a newline. * Does not change pc.len if a newline isn't found. * This is not the same as parse_whitespace() because it only consumes until * a single newline is encountered. */ static bool parse_newline(tok_ctx &ctx); /** * PAWN #define is different than C/C++. * #define PATTERN REPLACEMENT_TEXT * The PATTERN may not contain a space or '[' or ']'. * A generic whitespace check should be good enough. * Do not change the pattern. * * @param pc structure to update, str is an input */ static void parse_pawn_pattern(tok_ctx &ctx, Chunk &pc, E_Token tt); static bool parse_ignored(tok_ctx &ctx, Chunk &pc); /** * Skips the next bit of whatever and returns the type of block. * * pc.str is the input text. * pc.len in the output length. * pc.type is the output type * pc.column is output column * * @param pc The structure to update, str is an input. * @param prev_pc The previous structure * * @return true/false - whether anything was parsed */ static bool parse_next(tok_ctx &ctx, Chunk &pc, const Chunk *prev_pc); /** * Parses all legal D string constants. * * Quoted strings: * r"Wysiwyg" # WYSIWYG string * x"hexstring" # Hexadecimal array * `Wysiwyg` # WYSIWYG string * 'char' # single character * "reg_string" # regular string * * Non-quoted strings: * \x12 # 1-byte hex constant * \u1234 # 2-byte hex constant * \U12345678 # 4-byte hex constant * \123 # octal constant * \& # named entity * \n # single character * * @param pc The structure to update, str is an input. * * @return Whether a string was parsed */ static bool d_parse_string(tok_ctx &ctx, Chunk &pc); /** * Figure of the length of the comment at text. * The next bit of text starts with a '/', so it might be a comment. * There are three types of comments: * - C comments that start with '/ *' and end with '* /' * - C++ comments that start with // * - D nestable comments '/+' '+/' * * @param pc The structure to update, str is an input. * * @return Whether a comment was parsed */ static bool parse_comment(tok_ctx &ctx, Chunk &pc); /** * Figure of the length of the code placeholder at text, if present. * This is only for Xcode which sometimes inserts temporary code placeholder chunks, which in plaintext <#look like this#>. * * @param pc The structure to update, str is an input. * * @return Whether a placeholder was parsed. */ static bool parse_code_placeholder(tok_ctx &ctx, Chunk &pc); /** * Parse any attached suffix, which may be a user-defined literal suffix. * If for a string, explicitly exclude common format and scan specifiers, ie, * PRIx32 and SCNx64. */ static void parse_suffix(tok_ctx &ctx, Chunk &pc, bool forstring); //! check if a symbol holds a boolean value static bool is_bin(int ch); static bool is_bin_(int ch); //! check if a symbol holds a octal value static bool is_oct(int ch); static bool is_oct_(int ch); //! check if a symbol holds a decimal value; static bool is_dec(int ch); static bool is_dec_(int ch); //! check if a symbol holds a hexadecimal value static bool is_hex(int ch); static bool is_hex_(int ch); /** * Count the number of characters in the number. * The next bit of text starts with a number (0-9 or '.'), so it is a number. * Count the number of characters in the number. * * This should cover all number formats for all languages. * Note that this is not a strict parser. It will happily parse numbers in * an invalid format. * * For example, only D allows underscores in the numbers, but they are * allowed in all formats. * * @param[in,out] pc The structure to update, str is an input. * * @return Whether a number was parsed */ static bool parse_number(tok_ctx &ctx, Chunk &pc); static bool d_parse_string(tok_ctx &ctx, Chunk &pc) { size_t ch = ctx.peek(); if ( ch == '"' || ch == '\'') { return(parse_string(ctx, pc, 0, true)); } if (ch == '`') { return(parse_string(ctx, pc, 0, false)); } if ( ( ch == 'r' || ch == 'x') && ctx.peek(1) == '"') { return(parse_string(ctx, pc, 1, false)); } if (ch != '\\') { return(false); } ctx.save(); int cnt; pc.Str().clear(); while (ctx.peek() == '\\') { pc.Str().append(ctx.get()); // Check for end of file switch (ctx.peek()) { case 'x': // \x HexDigit HexDigit cnt = 3; while (cnt--) { pc.Str().append(ctx.get()); } break; case 'u': // \u HexDigit (x4) cnt = 5; while (cnt--) { pc.Str().append(ctx.get()); } break; case 'U': // \U HexDigit (x8) cnt = 9; while (cnt--) { pc.Str().append(ctx.get()); } break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': // handle up to 3 octal digits pc.Str().append(ctx.get()); ch = ctx.peek(); if ( (ch >= '0') && (ch <= '7')) { pc.Str().append(ctx.get()); ch = ctx.peek(); if ( (ch >= '0') && (ch <= '7')) { pc.Str().append(ctx.get()); } } break; case '&': // \& NamedCharacterEntity ; pc.Str().append(ctx.get()); while (unc_isalpha(ctx.peek())) { pc.Str().append(ctx.get()); } if (ctx.peek() == ';') { pc.Str().append(ctx.get()); } break; default: // Everything else is a single character pc.Str().append(ctx.get()); break; } // switch } if (pc.GetStr().size() < 1) { ctx.restore(); return(false); } pc.SetType(CT_STRING); return(true); } // d_parse_string #if 0 //! A string-in-string search. Like strstr() with a haystack length. static const char *str_search(const char *needle, const char *haystack, int haystack_len) { int needle_len = strlen(needle); while (haystack_len-- >= needle_len) { if (memcmp(needle, haystack, needle_len) == 0) { return(haystack); } haystack++; } return(NULL); } #endif static bool parse_comment(tok_ctx &ctx, Chunk &pc) { bool is_d = language_is_set(LANG_D); bool is_cs = language_is_set(LANG_CS); size_t d_level = 0; // does this start with '/ /' or '/ *' or '/ +' (d) if ( (ctx.peek() != '/') || ( (ctx.peek(1) != '*') && (ctx.peek(1) != '/') && ( (ctx.peek(1) != '+') || !is_d))) { return(false); } ctx.save(); // account for opening two chars pc.Str() = ctx.get(); // opening '/' size_t ch = ctx.get(); pc.Str().append(ch); // second char if (ch == '/') { pc.SetType(CT_COMMENT_CPP); while (true) { int bs_cnt = 0; while (ctx.more()) { ch = ctx.peek(); if ( (ch == '\r') || (ch == '\n')) { break; } if ( (ch == '\\') && !is_cs) // backslashes aren't special in comments in C# { bs_cnt++; } else { bs_cnt = 0; } pc.Str().append(ctx.get()); } /* * If we hit an odd number of backslashes right before the newline, * then we keep going. */ if ( ((bs_cnt & 1) == 0) || !ctx.more()) { break; } if (ctx.peek() == '\r') { pc.Str().append(ctx.get()); } if (ctx.peek() == '\n') { pc.Str().append(ctx.get()); } pc.SetNlCount(pc.GetNlCount() + 1); cpd.did_newline = true; } } else if (!ctx.more()) { // unexpected end of file ctx.restore(); return(false); } else if (ch == '+') { pc.SetType(CT_COMMENT); d_level++; while ( d_level > 0 && ctx.more()) { if ( (ctx.peek() == '+') && (ctx.peek(1) == '/')) { pc.Str().append(ctx.get()); // store the '+' pc.Str().append(ctx.get()); // store the '/' d_level--; continue; } if ( (ctx.peek() == '/') && (ctx.peek(1) == '+')) { pc.Str().append(ctx.get()); // store the '/' pc.Str().append(ctx.get()); // store the '+' d_level++; continue; } ch = ctx.get(); pc.Str().append(ch); if ( (ch == '\n') || (ch == '\r')) { pc.SetType(CT_COMMENT_MULTI); pc.SetNlCount(pc.GetNlCount() + 1); if (ch == '\r') { if (ctx.peek() == '\n') { ++LE_COUNT(CRLF); pc.Str().append(ctx.get()); // store the '\n' } else { ++LE_COUNT(CR); } } else { ++LE_COUNT(LF); } } } } else // must be '/ *' { pc.SetType(CT_COMMENT); while (ctx.more()) { if ( (ctx.peek() == '*') && (ctx.peek(1) == '/')) { pc.Str().append(ctx.get()); // store the '*' pc.Str().append(ctx.get()); // store the '/' tok_info ss; ctx.save(ss); size_t oldsize = pc.GetStr().size(); // If there is another C comment right after this one, combine them while ( (ctx.peek() == ' ') || (ctx.peek() == '\t')) { pc.Str().append(ctx.get()); } if ( (ctx.peek() != '/') || (ctx.peek(1) != '*')) { // undo the attempt to join ctx.restore(ss); pc.Str().resize(oldsize); break; } } ch = ctx.get(); pc.Str().append(ch); if ( (ch == '\n') || (ch == '\r')) { pc.SetType(CT_COMMENT_MULTI); pc.SetNlCount(pc.GetNlCount() + 1); if (ch == '\r') { if (ctx.peek() == '\n') { ++LE_COUNT(CRLF); pc.Str().append(ctx.get()); // store the '\n' } else { ++LE_COUNT(CR); } } else { ++LE_COUNT(LF); } } } } if (cpd.unc_off) { bool found_enable_marker = (find_enable_processing_comment_marker(pc.GetStr()) >= 0); if (found_enable_marker) { const auto &ontext = options::enable_processing_cmt(); LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n", __func__, __LINE__, ontext.c_str(), pc.GetOrigLine()); cpd.unc_off = false; } } else { auto position_disable_processing_cmt = find_disable_processing_comment_marker(pc.GetStr()); bool found_disable_marker = (position_disable_processing_cmt >= 0); if (found_disable_marker) { /** * the user may wish to disable processing part of a multiline comment, * in which case we'll handle at a late time. Check to see if processing * is re-enabled elsewhere in this comment */ auto position_enable_processing_cmt = find_enable_processing_comment_marker(pc.GetStr()); if (position_enable_processing_cmt < position_disable_processing_cmt) { const auto &offtext = options::disable_processing_cmt(); LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n", __func__, __LINE__, offtext.c_str(), pc.GetOrigLine()); cpd.unc_off = true; // Issue #842 cpd.unc_off_used = true; } } } return(true); } // parse_comment static bool parse_code_placeholder(tok_ctx &ctx, Chunk &pc) { if ( (ctx.peek() != '<') || (ctx.peek(1) != '#')) { return(false); } ctx.save(); // account for opening two chars '<#' pc.Str() = ctx.get(); pc.Str().append(ctx.get()); // grab everything until '#>', fail if not found. size_t last1 = 0; while (ctx.more()) { size_t last2 = last1; last1 = ctx.get(); pc.Str().append(last1); if ( (last2 == '#') && (last1 == '>')) { pc.SetType(CT_WORD); return(true); } } ctx.restore(); return(false); } static void parse_suffix(tok_ctx &ctx, Chunk &pc, bool forstring = false) { if (CharTable::IsKw1(ctx.peek())) { size_t slen = 0; size_t oldsize = pc.GetStr().size(); // don't add the suffix if we see L" or L' or S" size_t p1 = ctx.peek(); size_t p2 = ctx.peek(1); if ( forstring && ( ( (p1 == 'L') && ( (p2 == '"') || (p2 == '\''))) || ( (p1 == 'S') && (p2 == '"')))) { return; } tok_info ss; ctx.save(ss); while ( ctx.more() && CharTable::IsKw2(ctx.peek())) { slen++; pc.Str().append(ctx.get()); } if ( forstring && slen >= 4 && ( pc.GetStr().startswith("PRI", oldsize) || pc.GetStr().startswith("SCN", oldsize))) { ctx.restore(ss); pc.Str().resize(oldsize); } } } static bool is_bin(int ch) { return( (ch == '0') || (ch == '1')); } static bool is_bin_(int ch) { return( is_bin(ch) || ch == '_' || ch == '\''); } static bool is_oct(int ch) { return( (ch >= '0') && (ch <= '7')); } static bool is_oct_(int ch) { return( is_oct(ch) || ch == '_' || ch == '\''); } static bool is_dec(int ch) { return( (ch >= '0') && (ch <= '9')); } static bool is_dec_(int ch) { // number separators: JAVA: "_", C++14: "'" return( is_dec(ch) || (ch == '_') || (ch == '\'')); } static bool is_hex(int ch) { return( ( (ch >= '0') && (ch <= '9')) || ( (ch >= 'a') && (ch <= 'f')) || ( (ch >= 'A') && (ch <= 'F'))); } static bool is_hex_(int ch) { return( is_hex(ch) || ch == '_' || ch == '\''); } static bool parse_number(tok_ctx &ctx, Chunk &pc) { /* * A number must start with a digit or a dot, followed by a digit * (signs handled elsewhere) */ if ( !is_dec(ctx.peek()) && ( (ctx.peek() != '.') || !is_dec(ctx.peek(1)))) { return(false); } bool is_float = (ctx.peek() == '.'); if ( is_float && (ctx.peek(1) == '.')) // make sure it isn't '..' { return(false); } /* * Check for Hex, Octal, or Binary * Note that only D, C++14 and Pawn support binary * Fixes the issue # 1591 * In c# the numbers starting with 0 are not treated as octal numbers. */ bool did_hex = false; if ( ctx.peek() == '0' && !language_is_set(LANG_CS)) { size_t ch; Chunk pc_temp; pc.Str().append(ctx.get()); // store the '0' pc_temp.Str().append('0'); // MS constant might have an "h" at the end. Look for it ctx.save(); while ( ctx.more() && CharTable::IsKw2(ctx.peek())) { ch = ctx.get(); pc_temp.Str().append(ch); } ch = pc_temp.GetStr()[pc_temp.Len() - 1]; ctx.restore(); LOG_FMT(LGUY, "%s(%d): pc_temp:%s\n", __func__, __LINE__, pc_temp.Text()); if (ch == 'h') // TODO can we combine this in analyze_character { // we have an MS hexadecimal number with "h" at the end LOG_FMT(LGUY, "%s(%d): MS hexadecimal number\n", __func__, __LINE__); did_hex = true; do { pc.Str().append(ctx.get()); // store the rest } while (is_hex_(ctx.peek())); pc.Str().append(ctx.get()); // store the h LOG_FMT(LGUY, "%s(%d): pc:%s\n", __func__, __LINE__, pc.Text()); } else { switch (unc_toupper(ctx.peek())) { case 'X': // hex did_hex = true; do { pc.Str().append(ctx.get()); // store the 'x' and then the rest } while (is_hex_(ctx.peek())); break; case 'B': // binary do { pc.Str().append(ctx.get()); // store the 'b' and then the rest } while (is_bin_(ctx.peek())); break; case '0': // octal or decimal case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': do { pc.Str().append(ctx.get()); } while (is_oct_(ctx.peek())); break; default: // either just 0 or 0.1 or 0UL, etc break; } // switch } } else { // Regular int or float while (is_dec_(ctx.peek())) { pc.Str().append(ctx.get()); } } // Check if we stopped on a decimal point & make sure it isn't '..' if ( (ctx.peek() == '.') && (ctx.peek(1) != '.')) { // Issue #1265, 5.clamp() tok_info ss; ctx.save(ss); while ( ctx.more() && CharTable::IsKw2(ctx.peek(1))) { // skip characters to check for paren open ctx.get(); } if (ctx.peek(1) == '(') { ctx.restore(ss); pc.SetType(CT_NUMBER); return(true); } else { ctx.restore(ss); } pc.Str().append(ctx.get()); is_float = true; if (did_hex) { while (is_hex_(ctx.peek())) { pc.Str().append(ctx.get()); } } else { while (is_dec_(ctx.peek())) { pc.Str().append(ctx.get()); } } } /* * Check exponent * Valid exponents per language (not that it matters): * C/C++/D/Java: eEpP * C#/Pawn: eE */ size_t tmp = unc_toupper(ctx.peek()); if ( (tmp == 'E') || (tmp == 'P')) { is_float = true; pc.Str().append(ctx.get()); if ( (ctx.peek() == '+') || (ctx.peek() == '-')) { pc.Str().append(ctx.get()); } while (is_dec_(ctx.peek())) { pc.Str().append(ctx.get()); } } /* * Check the suffixes * Valid suffixes per language (not that it matters): * Integer Float * C/C++: uUlL64 lLfF * C#: uUlL fFdDMm * D: uUL ifFL * Java: lL fFdD * Pawn: (none) (none) * * Note that i, f, d, and m only appear in floats. */ while (1) { size_t tmp2 = unc_toupper(ctx.peek()); if ( (tmp2 == 'I') || (tmp2 == 'F') || (tmp2 == 'D') || (tmp2 == 'M')) { is_float = true; } else if ( (tmp2 != 'L') && (tmp2 != 'U')) { break; } pc.Str().append(ctx.get()); } // skip the Microsoft-specific '32' and '64' suffix if ( ( (ctx.peek() == '3') && (ctx.peek(1) == '2')) || ( (ctx.peek() == '6') && (ctx.peek(1) == '4'))) { pc.Str().append(ctx.get()); pc.Str().append(ctx.get()); } pc.SetType(is_float ? CT_NUMBER_FP : CT_NUMBER); /* * If there is anything left, then we are probably dealing with garbage or * some sick macro junk. Eat it. */ parse_suffix(ctx, pc); return(true); } // parse_number static bool parse_string(tok_ctx &ctx, Chunk &pc, size_t quote_idx, bool allow_escape) { log_rule_B("string_escape_char"); const size_t escape_char = options::string_escape_char(); log_rule_B("string_escape_char2"); const size_t escape_char2 = options::string_escape_char2(); log_rule_B("string_replace_tab_chars"); const bool should_escape_tabs = ( allow_escape && options::string_replace_tab_chars() && language_is_set(LANG_ALLC)); pc.Str().clear(); while (quote_idx-- > 0) { pc.Str().append(ctx.get()); } pc.SetType(CT_STRING); const size_t termination_character = CharTable::Get(ctx.peek()) & 0xff; pc.Str().append(ctx.get()); // store the " bool escaped = false; while (ctx.more()) { const size_t ch = ctx.get(); // convert char 9 (\t) to chars \t if ( (ch == '\t') && should_escape_tabs) { const size_t lastcol = ctx.c.col - 1; ctx.c.col = lastcol + 2; pc.Str().append(escape_char); pc.Str().append('t'); continue; } pc.Str().append(ch); if (ch == '\n') { pc.SetNlCount(pc.GetNlCount() + 1); pc.SetType(CT_STRING_MULTI); } else if ( ch == '\r' && ctx.peek() != '\n') { pc.Str().append(ctx.get()); pc.SetNlCount(pc.GetNlCount() + 1); pc.SetType(CT_STRING_MULTI); } // if last char in prev loop was escaped the one in the current loop isn't if (escaped) { escaped = false; continue; } // see if the current char is a escape char if (allow_escape) { if (ch == escape_char) { escaped = (escape_char != 0); continue; } if ( ch == escape_char2 && (ctx.peek() == termination_character)) { escaped = allow_escape; continue; } } if (ch == termination_character) { break; } } parse_suffix(ctx, pc, true); return(true); } // parse_string enum cs_string_t { CS_STRING_NONE = 0, CS_STRING_STRING = 1 << 0, // is any kind of string CS_STRING_VERBATIM = 1 << 1, // @"" style string CS_STRING_INTERPOLATED = 1 << 2, // $"" or $@"" style string }; static cs_string_t operator|=(cs_string_t &value, cs_string_t other) { return(value = static_cast(value | other)); } static cs_string_t parse_cs_string_start(tok_ctx &ctx, Chunk &pc) { cs_string_t stringType = CS_STRING_NONE; int offset = 0; if (ctx.peek(offset) == '$') { stringType |= CS_STRING_INTERPOLATED; ++offset; } if (ctx.peek(offset) == '@') { stringType |= CS_STRING_VERBATIM; ++offset; } if (ctx.peek(offset) == '"') { stringType |= CS_STRING_STRING; pc.SetType(CT_STRING); for (int i = 0; i <= offset; ++i) { pc.Str().append(ctx.get()); } } else { stringType = CS_STRING_NONE; } return(stringType); } // parse_cs_string_start struct CsStringParseState { cs_string_t type; int braceDepth; CsStringParseState(cs_string_t stringType) { type = stringType; braceDepth = 0; } }; /** * C# strings are complex enough (mostly due to interpolation and nesting) that they need a custom parser. */ static bool parse_cs_string(tok_ctx &ctx, Chunk &pc) { cs_string_t stringType = parse_cs_string_start(ctx, pc); if (stringType == 0) { return(false); } // an interpolated string can contain {expressions}, which can contain $"strings", which in turn // can contain {expressions}, so we must track both as they are interleaved, in order to properly // parse the outermost string. std::stack parseState; // each entry is a nested string parseState.push(CsStringParseState(stringType)); log_rule_B("string_replace_tab_chars"); bool should_escape_tabs = options::string_replace_tab_chars(); while (ctx.more()) { if (parseState.top().braceDepth > 0) { // all we can do when in an expr is look for expr close with }, or a new string opening. must do this first // so we can peek and potentially consume chars for new string openings, before the ch=get() happens later, // which is needed for newline processing. if (ctx.peek() == '}') { pc.Str().append(ctx.get()); if (ctx.peek() == '}') { pc.Str().append(ctx.get()); // in interpolated string, `}}` is escape'd `}` } else { --parseState.top().braceDepth; } continue; } stringType = parse_cs_string_start(ctx, pc); if (stringType) { parseState.push(CsStringParseState(stringType)); continue; } } int lastcol = ctx.c.col; int ch = ctx.get(); pc.Str().append(ch); if (ch == '\n') { pc.SetType(CT_STRING_MULTI); pc.SetNlCount(pc.GetNlCount() + 1); } else if (ch == '\r') { pc.SetType(CT_STRING_MULTI); } else if (parseState.top().braceDepth > 0) { // do nothing. if we're in a brace, we only want the newline handling, and skip the rest. } else if ( (ch == '\t') && should_escape_tabs) { if (parseState.top().type & CS_STRING_VERBATIM) { if (!cpd.warned_unable_string_replace_tab_chars) { cpd.warned_unable_string_replace_tab_chars = true; log_rule_B("warn_level_tabs_found_in_verbatim_string_literals"); log_sev_t warnlevel = (log_sev_t)options::warn_level_tabs_found_in_verbatim_string_literals(); /* * a tab char can't be replaced with \\t because escapes don't * work in here-strings. best we can do is warn. */ LOG_FMT(warnlevel, "%s(%d): %s: orig line is %zu, orig col is %zu, Detected non-replaceable tab char in literal string\n", __func__, __LINE__, cpd.filename.c_str(), pc.GetOrigLine(), pc.GetOrigCol()); LOG_FMT(warnlevel, "%s(%d): Warning is given if doing tab-to-\\t replacement and we have found one in a C# verbatim string literal.\n", __func__, __LINE__); if (warnlevel < LWARN) { // TODO: replace the code ?? cpd.error_count++; } } } else { ctx.c.col = lastcol + 2; pc.Str().pop_back(); // remove \t pc.Str().append("\\t"); continue; } } else if ( ch == '\\' && !(parseState.top().type & CS_STRING_VERBATIM)) { // catch escaped quote in order to avoid ending string (but also must handle \\ to avoid accidental 'escape' seq of `\\"`) if ( ctx.peek() == '"' || ctx.peek() == '\\') { pc.Str().append(ctx.get()); } } else if (ch == '"') { if ( (parseState.top().type & CS_STRING_VERBATIM) && (ctx.peek() == '"')) { // in verbatim string, `""` is escape'd `"` pc.Str().append(ctx.get()); } else { // end of string parseState.pop(); if (parseState.empty()) { break; } } } else if (parseState.top().type & CS_STRING_INTERPOLATED) { if (ch == '{') { if (ctx.peek() == '{') { pc.Str().append(ctx.get()); // in interpolated string, `{{` is escape'd `{` } else { ++parseState.top().braceDepth; } } } } return(true); } // parse_cs_string static void parse_verbatim_string(tok_ctx &ctx, Chunk &pc) { pc.SetType(CT_STRING); // consume the initial """ pc.Str() = ctx.get(); pc.Str().append(ctx.get()); pc.Str().append(ctx.get()); // go until we hit a zero (end of file) or a """ while (ctx.more()) { size_t ch = ctx.get(); pc.Str().append(ch); if ( (ch == '"') && (ctx.peek() == '"') && (ctx.peek(1) == '"')) { pc.Str().append(ctx.get()); pc.Str().append(ctx.get()); break; } if ( (ch == '\n') || (ch == '\r')) { pc.SetType(CT_STRING_MULTI); pc.SetNlCount(pc.GetNlCount() + 1); } } } static bool tag_compare(const deque &d, size_t a_idx, size_t b_idx, size_t len) { if (a_idx != b_idx) { while (len-- > 0) { if (d[a_idx] != d[b_idx]) { return(false); } } } return(true); } static bool parse_cr_string(tok_ctx &ctx, Chunk &pc, size_t q_idx) { size_t tag_idx = ctx.c.idx + q_idx + 1; size_t tag_len = 0; ctx.save(); // Copy the prefix + " to the string pc.Str().clear(); int cnt = q_idx + 1; while (cnt--) { pc.Str().append(ctx.get()); } // Add the tag and get the length of the tag while ( ctx.more() && (ctx.peek() != '(')) { tag_len++; pc.Str().append(ctx.get()); } if (ctx.peek() != '(') { ctx.restore(); return(false); } pc.SetType(CT_STRING); while (ctx.more()) { if ( (ctx.peek() == ')') && (ctx.peek(tag_len + 1) == '"') && tag_compare(ctx.data, tag_idx, ctx.c.idx + 1, tag_len)) { cnt = tag_len + 2; // for the )" while (cnt--) { pc.Str().append(ctx.get()); } parse_suffix(ctx, pc); return(true); } if (ctx.peek() == '\n') { pc.Str().append(ctx.get()); pc.SetNlCount(pc.GetNlCount() + 1); pc.SetType(CT_STRING_MULTI); } else { pc.Str().append(ctx.get()); } } ctx.restore(); return(false); } // parse_cr_string /** * Count the number of characters in a word. * The first character is already valid for a keyword * * @param pc The structure to update, str is an input. * @return Whether a word was parsed (always true) */ static bool parse_word(tok_ctx &ctx, Chunk &pc, bool skipcheck) { static unc_text intr_txt("@interface"); // The first character is already valid pc.Str().clear(); pc.Str().append(ctx.get()); while (ctx.more()) { size_t ch = ctx.peek(); if (CharTable::IsKw2(ch)) { pc.Str().append(ctx.get()); } else if ( (ch == '\\') && (unc_tolower(ctx.peek(1)) == 'u')) { pc.Str().append(ctx.get()); pc.Str().append(ctx.get()); skipcheck = true; } else { break; } // HACK: Non-ASCII character are only allowed in identifiers if (ch > 0x7f) { skipcheck = true; } } pc.SetType(CT_WORD); if (skipcheck) { return(true); } // Detect pre-processor functions now if ( cpd.in_preproc == CT_PP_DEFINE && cpd.preproc_ncnl_count == 1) { if (ctx.peek() == '(') { pc.SetType(CT_MACRO_FUNC); } else { pc.SetType(CT_MACRO); log_rule_B("pp_ignore_define_body"); if (options::pp_ignore_define_body()) { /* * We are setting the PP_IGNORE preproc state because the following * chunks are part of the macro body and will have to be ignored. */ cpd.in_preproc = CT_PP_IGNORE; } } } else { // '@interface' is reserved, not an interface itself if ( language_is_set(LANG_JAVA) && pc.GetStr().startswith("@") && !pc.GetStr().equals(intr_txt)) { pc.SetType(CT_ANNOTATION); } else { // Turn it into a keyword now // Issue #1460 will return "COMMENT_CPP" pc.SetType(find_keyword_type(pc.Text(), pc.GetStr().size())); /* Special pattern: if we're trying to redirect a preprocessor directive to PP_IGNORE, * then ensure we're actually part of a preprocessor before doing the swap, or we'll * end up with a function named 'define' as PP_IGNORE. This is necessary because with * the config 'set' feature, there's no way to do a pair of tokens as a word * substitution. */ if ( pc.GetType() == CT_PP_IGNORE && !cpd.in_preproc) { pc.SetType(find_keyword_type(pc.Text(), pc.GetStr().size())); } else if (pc.GetType() == CT_COMMENT_CPP) // Issue #1460 { size_t ch; bool is_cs = language_is_set(LANG_CS); // read until EOL while (true) { int bs_cnt = 0; while (ctx.more()) { ch = ctx.peek(); if ( (ch == '\r') || (ch == '\n')) { break; } if ( (ch == '\\') && !is_cs) // backslashes aren't special in comments in C# { bs_cnt++; } else { bs_cnt = 0; } pc.Str().append(ctx.get()); } /* * If we hit an odd number of backslashes right before the newline, * then we keep going. */ if ( ((bs_cnt & 1) == 0) || !ctx.more()) { break; } if (ctx.peek() == '\r') { pc.Str().append(ctx.get()); } if (ctx.peek() == '\n') { pc.Str().append(ctx.get()); } pc.SetNlCount(pc.GetNlCount() + 1); cpd.did_newline = true; } // Store off the end column pc.SetOrigColEnd(ctx.c.col); } } } return(true); } // parse_word static size_t parse_attribute_specifier_sequence(tok_ctx &ctx) { size_t nested = 0; size_t offset = 0; size_t parens = 0; auto ch1 = ctx.peek(offset++); while (ch1) { auto ch2 = ctx.peek(offset++); while ( ch2 == ' ' || ch2 == '\n' || ch2 == '\r' || ch2 == '\t') { ch2 = ctx.peek(offset++); } if ( nested == 0 && ch2 != '[') { break; } if (ch1 == '(') { ++parens; ch1 = ch2; continue; } if (ch1 == ')') { if (parens == 0) { break; } --parens; ch1 = ch2; continue; } if ( ch1 != '[' && ch1 != ']') { ch1 = ch2; continue; } if (ch2 != ch1) { if (parens == 0) { break; } ch1 = ch2; continue; } if (ch1 == '[') { if ( nested != 0 && parens == 0) { break; } ++nested; } else if (--nested == 0) { return(offset); } ch1 = ctx.peek(offset++); } return(0); } // parse_attribute_specifier_sequence static bool extract_attribute_specifier_sequence(tok_ctx &ctx, Chunk &pc, size_t length) { pc.Str().clear(); while (length--) { pc.Str().append(ctx.get()); } pc.SetType(CT_ATTRIBUTE); return(true); } // extract_attribute_specifier_sequence static bool parse_whitespace(tok_ctx &ctx, Chunk &pc) { size_t nl_count = 0; size_t ch = 0; // REVISIT: use a better whitespace detector? while ( ctx.more() && unc_isspace(ctx.peek())) { int lastcol = ctx.c.col; ch = ctx.get(); // throw away the whitespace char switch (ch) { case '\r': if (ctx.expect('\n')) { // CRLF ending ++LE_COUNT(CRLF); } else { // CR ending ++LE_COUNT(CR); } nl_count++; pc.SetOrigPrevSp(0); break; case '\n': // LF ending ++LE_COUNT(LF); nl_count++; pc.SetOrigPrevSp(0); break; case '\t': pc.SetOrigPrevSp(pc.GetOrigPrevSp() + ctx.c.col - lastcol); break; case ' ': pc.SetOrigPrevSp(pc.GetOrigPrevSp() + 1); break; default: break; } } if (ch != 0) { pc.Str().clear(); pc.SetType(nl_count ? CT_NEWLINE : CT_WHITESPACE); pc.SetNlCount(nl_count); pc.SetAfterTab((ctx.c.last_ch == '\t')); return(true); } return(false); } // parse_whitespace static bool parse_bs_newline(tok_ctx &ctx, Chunk &pc) { ctx.save(); ctx.get(); // skip the '\' size_t ch; while ( ctx.more() && unc_isspace(ch = ctx.peek())) { ctx.get(); if ( (ch == '\r') || (ch == '\n')) { if (ch == '\r') { ctx.expect('\n'); } pc.SetType(CT_NL_CONT); pc.Str() = "\\"; pc.SetNlCount(1); return(true); } } ctx.restore(); return(false); } static bool parse_newline(tok_ctx &ctx) { ctx.save(); // Eat whitespace while ( (ctx.peek() == ' ') || (ctx.peek() == '\t')) { ctx.get(); } if ( (ctx.peek() == '\r') || (ctx.peek() == '\n')) { if (!ctx.expect('\n')) { ctx.get(); ctx.expect('\n'); } return(true); } ctx.restore(); return(false); } static void parse_pawn_pattern(tok_ctx &ctx, Chunk &pc, E_Token tt) { pc.Str().clear(); pc.SetType(tt); while (!unc_isspace(ctx.peek())) { // end the pattern on an escaped newline if (ctx.peek() == '\\') { size_t ch = ctx.peek(1); if ( (ch == '\n') || (ch == '\r')) { break; } } pc.Str().append(ctx.get()); } } static bool parse_off_newlines(tok_ctx &ctx, Chunk &pc) { size_t nl_count = 0; // Parse off newlines/blank lines while (parse_newline(ctx)) { nl_count++; } if (nl_count > 0) { pc.SetNlCount(nl_count); pc.SetType(CT_NEWLINE); return(true); } return(false); } static bool parse_macro(tok_ctx &ctx, Chunk &pc, const Chunk *prev_pc) { if (parse_off_newlines(ctx, pc)) { return(true); } if (parse_comment(ctx, pc)) // allow CT_COMMENT_MULTI within macros { return(true); } ctx.save(); pc.Str().clear(); if (prev_pc == nullptr) { return(false); } bool continued = ( prev_pc->Is(CT_NL_CONT) || prev_pc->Is(CT_COMMENT_MULTI)); while (ctx.more()) { size_t pk = ctx.peek(), pk1 = ctx.peek(1); bool nl = ( pk == '\n' || pk == '\r'); bool nl_cont = ( pk == '\\' && ( pk1 == '\n' || pk1 == '\r')); if ( ( nl_cont || ( continued && nl)) && pc.GetStr().size() > 0) { pc.SetType(CT_PP_IGNORE); return(true); } else if (nl) { break; } pc.Str().append(ctx.get()); } pc.Str().clear(); ctx.restore(); return(false); } // parse_macro static bool parse_ignored(tok_ctx &ctx, Chunk &pc) { if (parse_off_newlines(ctx, pc)) { return(true); } // See if the UO_enable_processing_cmt or #pragma endasm / #endasm text is on this line ctx.save(); pc.Str().clear(); while ( ctx.more() && (ctx.peek() != '\r') && (ctx.peek() != '\n')) { pc.Str().append(ctx.get()); } if (pc.GetStr().size() == 0) { // end of file? return(false); } // HACK: turn on if we find '#endasm' or '#pragma' and 'endasm' separated by blanks if ( ( ( (pc.GetStr().find("#pragma ") >= 0) || (pc.GetStr().find("#pragma ") >= 0)) && ( (pc.GetStr().find(" endasm") >= 0) || (pc.GetStr().find(" endasm") >= 0))) || (pc.GetStr().find("#endasm") >= 0)) { cpd.unc_off = false; ctx.restore(); pc.Str().clear(); return(false); } // Note that we aren't actually making sure this is in a comment, yet log_rule_B("enable_processing_cmt"); const auto &ontext = options::enable_processing_cmt(); if (!ontext.empty()) { bool found_enable_pattern = false; if ( ontext != UNCRUSTIFY_ON_TEXT && options::processing_cmt_as_regex()) { std::wstring pc_wstring(pc.GetStr().get().cbegin(), pc.GetStr().get().cend()); std::wregex criteria(std::wstring(ontext.cbegin(), ontext.cend())); found_enable_pattern = std::regex_search(pc_wstring.cbegin(), pc_wstring.cend(), criteria); } else { found_enable_pattern = (pc.GetStr().find(ontext.c_str()) >= 0); } if (!found_enable_pattern) { pc.SetType(CT_IGNORED); return(true); } } ctx.restore(); // parse off whitespace leading to the comment if (parse_whitespace(ctx, pc)) { pc.SetType(CT_IGNORED); return(true); } // Look for the ending comment and let it pass if ( parse_comment(ctx, pc) && !cpd.unc_off) { return(true); } // Reset the chunk & scan to until a newline pc.Str().clear(); while ( ctx.more() && (ctx.peek() != '\r') && (ctx.peek() != '\n')) { pc.Str().append(ctx.get()); } if (pc.GetStr().size() > 0) { pc.SetType(CT_IGNORED); return(true); } return(false); } // parse_ignored static bool parse_next(tok_ctx &ctx, Chunk &pc, const Chunk *prev_pc) { if (!ctx.more()) { return(false); } // Save off the current column pc.SetType(CT_NONE); pc.SetOrigLine(ctx.c.row); pc.SetColumn(ctx.c.col); pc.SetOrigCol(ctx.c.col); pc.SetNlCount(0); pc.SetFlags(PCF_NONE); // If it is turned off, we put everything except newlines into CT_UNKNOWN if (cpd.unc_off) { if (parse_ignored(ctx, pc)) { return(true); } } log_rule_B("disable_processing_nl_cont"); // Parse macro blocks if (options::disable_processing_nl_cont()) { if (parse_macro(ctx, pc, prev_pc)) { return(true); } } // Parse whitespace if (parse_whitespace(ctx, pc)) { return(true); } // Handle unknown/unhandled preprocessors if ( cpd.in_preproc > CT_PP_BODYCHUNK && cpd.in_preproc <= CT_PP_OTHER) { pc.Str().clear(); tok_info ss; ctx.save(ss); // Chunk to a newline or comment pc.SetType(CT_PREPROC_BODY); size_t last = 0; while (ctx.more()) { size_t ch = ctx.peek(); // Fix for issue #1752 // Ignoring extra spaces after ' \ ' for preproc body continuations if ( last == '\\' && ch == ' ') { ctx.get(); continue; } if ( (ch == '\n') || (ch == '\r')) { // Back off if this is an escaped newline if (last == '\\') { ctx.restore(ss); pc.Str().pop_back(); } break; } // Quit on a C or C++ comment start Issue #1966 if ( (ch == '/') && ( (ctx.peek(1) == '/') || (ctx.peek(1) == '*'))) { break; } last = ch; ctx.save(ss); pc.Str().append(ctx.get()); } if (pc.GetStr().size() > 0) { return(true); } } // Detect backslash-newline if ( (ctx.peek() == '\\') && parse_bs_newline(ctx, pc)) { return(true); } // Parse comments if (parse_comment(ctx, pc)) { return(true); } // Parse code placeholders if (parse_code_placeholder(ctx, pc)) { return(true); } if (language_is_set(LANG_CS)) { if (parse_cs_string(ctx, pc)) { return(true); } } if (language_is_set(LANG_CS | LANG_VALA)) { // check for non-keyword identifiers such as @if @switch, etc // Vala also allows numeric identifiers if prefixed with '@' if ( ctx.peek() == '@' && ( CharTable::IsKw1(ctx.peek(1)) || ( language_is_set(LANG_VALA) && CharTable::IsKw2(ctx.peek(1))))) { parse_word(ctx, pc, true); return(true); } } // handle VALA """ strings """ if ( language_is_set(LANG_VALA) && (ctx.peek() == '"') && (ctx.peek(1) == '"') && (ctx.peek(2) == '"')) { parse_verbatim_string(ctx, pc); return(true); } /* * handle C++(11) string/char literal prefixes u8|u|U|L|R including all * possible combinations and optional R delimiters: R"delim(x)delim" */ auto ch = ctx.peek(); if ( language_is_set(LANG_C | LANG_CPP) && ( ch == 'u' || ch == 'U' || ch == 'R' || ch == 'L')) { auto idx = size_t{}; auto is_real = false; if ( ch == 'u' && ctx.peek(1) == '8') { idx = 2; } else if ( unc_tolower(ch) == 'u' || ch == 'L') { idx++; } if ( language_is_set(LANG_C | LANG_CPP) && ctx.peek(idx) == 'R') { idx++; is_real = true; } const auto quote = ctx.peek(idx); if (is_real) { if ( quote == '"' && parse_cr_string(ctx, pc, idx)) { return(true); } } else if ( ( quote == '"' || quote == '\'') && parse_string(ctx, pc, idx, true)) { return(true); } } // PAWN specific stuff if (language_is_set(LANG_PAWN)) { if ( cpd.preproc_ncnl_count == 1 && ( cpd.in_preproc == CT_PP_DEFINE || cpd.in_preproc == CT_PP_EMIT)) { parse_pawn_pattern(ctx, pc, CT_MACRO); return(true); } // Check for PAWN strings: \"hi" or !"hi" or !\"hi" or \!"hi" if ( (ctx.peek() == '\\') || (ctx.peek() == '!')) { if (ctx.peek(1) == '"') { parse_string(ctx, pc, 1, (ctx.peek() == '!')); return(true); } if ( ( (ctx.peek(1) == '\\') || (ctx.peek(1) == '!')) && (ctx.peek(2) == '"')) { parse_string(ctx, pc, 2, false); return(true); } } // handle PAWN preprocessor args %0 .. %9 if ( cpd.in_preproc == CT_PP_DEFINE && (ctx.peek() == '%') && unc_isdigit(ctx.peek(1))) { pc.Str().clear(); pc.Str().append(ctx.get()); pc.Str().append(ctx.get()); pc.SetType(CT_WORD); return(true); } } // Parse strings and character constants //parse_word(ctx, pc_temp, true); //ctx.restore(ctx.c); if (parse_number(ctx, pc)) { return(true); } if (language_is_set(LANG_D)) { // D specific stuff if (d_parse_string(ctx, pc)) { return(true); } } else { // Not D stuff // Check for L'a', L"abc", 'a', "abc", strings ch = ctx.peek(); size_t ch1 = ctx.peek(1); if ( ( ( (ch == 'L') || (ch == 'S')) && ( (ch1 == '"') || (ch1 == '\''))) || (ch == '"') || (ch == '\'') || ( (ch == '<') && cpd.in_preproc == CT_PP_INCLUDE)) { parse_string(ctx, pc, unc_isalpha(ch) ? 1 : 0, true); if (cpd.in_preproc == CT_PP_INCLUDE) { pc.SetParentType(CT_PP_INCLUDE); } return(true); } if ( (ch == '<') && cpd.in_preproc == CT_PP_DEFINE) { if (Chunk::GetTail()->Is(CT_MACRO)) { // We have "#define XXX <", assume '<' starts an include string parse_string(ctx, pc, 0, false); return(true); } } /* Inside clang's __has_include() could be "path/to/file.h" or system-style */ Chunk *tail = Chunk::GetTail(); if ( (ch == '(') && (tail->IsNotNullChunk()) && ( tail->Is(CT_CNG_HASINC) || tail->Is(CT_CNG_HASINCN))) { parse_string(ctx, pc, 0, false); return(true); } } // Check for Vala string templates if ( language_is_set(LANG_VALA) && (ctx.peek() == '@')) { size_t nc = ctx.peek(1); if (nc == '"') { // literal string parse_string(ctx, pc, 1, true); return(true); } } // Check for Objective C literals if ( language_is_set(LANG_OC) && (ctx.peek() == '@')) { size_t nc = ctx.peek(1); if (nc == 'R') // Issue #2720 { if (ctx.peek(2) == '"') { if (parse_cr_string(ctx, pc, 2)) // Issue #3027 { return(true); } // parse string without escaping parse_string(ctx, pc, 2, false); return(true); } } if ( (nc == '"') || (nc == '\'')) { // literal string parse_string(ctx, pc, 1, true); return(true); } if ( (nc >= '0') && (nc <= '9')) { // literal number pc.Str().append(ctx.get()); // store the '@' parse_number(ctx, pc); return(true); } } // Check for pawn/ObjectiveC/Java and normal identifiers if ( CharTable::IsKw1(ctx.peek()) || ( (ctx.peek() == '\\') && (unc_tolower(ctx.peek(1)) == 'u')) || ( (ctx.peek() == '@') && CharTable::IsKw1(ctx.peek(1)))) { parse_word(ctx, pc, false); return(true); } // Check for C++11/14/17/20 attribute specifier sequences if ( language_is_set(LANG_CPP) && ctx.peek() == '[') { if ( !language_is_set(LANG_OC) || ( prev_pc != nullptr && !prev_pc->Is(CT_OC_AT))) { if (auto length = parse_attribute_specifier_sequence(ctx)) { extract_attribute_specifier_sequence(ctx, pc, length); return(true); } } } // see if we have a punctuator char punc_txt[7]; punc_txt[0] = ctx.peek(); punc_txt[1] = ctx.peek(1); punc_txt[2] = ctx.peek(2); punc_txt[3] = ctx.peek(3); punc_txt[4] = ctx.peek(4); punc_txt[5] = ctx.peek(5); punc_txt[6] = '\0'; const chunk_tag_t *punc; if ((punc = find_punctuator(punc_txt, cpd.lang_flags)) != nullptr) { int cnt = strlen(punc->tag); while (cnt--) { pc.Str().append(ctx.get()); } pc.SetType(punc->type); pc.SetFlagBits(PCF_PUNCTUATOR); return(true); } /* When parsing C/C++ files and running into some unknown token, * check if matches Objective-C as a last resort, before * considering it as garbage. */ int probe_lang_flags = 0; if (language_is_set(LANG_C | LANG_CPP)) { probe_lang_flags = cpd.lang_flags | LANG_OC; } if (probe_lang_flags != 0) { if ((punc = find_punctuator(punc_txt, probe_lang_flags)) != nullptr) { cpd.lang_flags = probe_lang_flags; int cnt = strlen(punc->tag); while (cnt--) { pc.Str().append(ctx.get()); } pc.SetType(punc->type); pc.SetFlagBits(PCF_PUNCTUATOR); return(true); } } // throw away this character pc.SetType(CT_UNKNOWN); pc.Str().append(ctx.get()); LOG_FMT(LWARN, "%s:%zu Garbage in col %zu: %x\n", cpd.filename.c_str(), pc.GetOrigLine(), ctx.c.col, pc.GetStr()[0]); exit(EX_SOFTWARE); } // parse_next int find_disable_processing_comment_marker(const unc_text &text, std::size_t start_idx) { log_rule_B("disable_processing_cmt"); const auto &offtext = options::disable_processing_cmt(); int idx = -1; if ( !offtext.empty() && start_idx < text.size()) { if ( offtext != UNCRUSTIFY_OFF_TEXT && options::processing_cmt_as_regex()) { std::wsmatch match; std::wstring pc_wstring(text.get().cbegin() + start_idx, text.get().cend()); std::wregex criteria(std::wstring(offtext.cbegin(), offtext.cend())); std::regex_search(pc_wstring.cbegin(), pc_wstring.cend(), match, criteria); if (!match.empty()) { idx = int(match.position() + start_idx); } } else { idx = text.find(offtext.c_str(), start_idx); if (idx >= 0) { idx += int(offtext.size()); } } /** * update the position to the start of the current line */ while ( idx > 0 && text[idx - 1] != '\n') { --idx; } } return(idx); } // find_disable_processing_comment_marker int find_enable_processing_comment_marker(const unc_text &text, std::size_t start_idx) { log_rule_B("enable_processing_cmt"); const auto &ontext = options::enable_processing_cmt(); int idx = -1; if ( !ontext.empty() && start_idx < text.size()) { if ( ontext != UNCRUSTIFY_ON_TEXT && options::processing_cmt_as_regex()) { std::wsmatch match; std::wstring pc_wstring(text.get().cbegin() + start_idx, text.get().cend()); std::wregex criteria(std::wstring(ontext.cbegin(), ontext.cend())); std::regex_search(pc_wstring.cbegin(), pc_wstring.cend(), match, criteria); if (!match.empty()) { idx = int(start_idx + match.position() + match.size()); } } else { idx = text.find(ontext.c_str(), start_idx); if (idx >= 0) { idx += int(ontext.size()); } } /** * update the position to the end of the current line */ if (idx >= 0) { while ( idx < int(text.size()) && text[idx] != '\n') { ++idx; } } } return(idx); } // find_enable_processing_comment_marker void tokenize(const deque &data, Chunk *ref) { tok_ctx ctx(data); Chunk chunk; Chunk *pc = nullptr; Chunk *rprev = nullptr; bool last_was_tab = false; size_t prev_sp = 0; int num_stripped = 0; // Issue #1966 cpd.unc_stage = unc_stage_e::TOKENIZE; while (ctx.more()) { chunk.Reset(); chunk.SetPpLevel(0); if (!parse_next(ctx, chunk, pc)) { LOG_FMT(LERR, "%s:%zu Bailed before the end?\n", cpd.filename.c_str(), ctx.c.row); exit(EX_SOFTWARE); } if ( language_is_set(LANG_JAVA) && chunk.GetType() == CT_MEMBER && !memcmp(chunk.Text(), "->", 2)) { chunk.SetType(CT_LAMBDA); } // Don't create an entry for whitespace if (chunk.GetType() == CT_WHITESPACE) { last_was_tab = chunk.GetAfterTab(); prev_sp = chunk.GetOrigPrevSp(); continue; } chunk.SetOrigPrevSp(prev_sp); prev_sp = 0; if (chunk.GetType() == CT_NEWLINE) { last_was_tab = chunk.GetAfterTab(); chunk.SetAfterTab(false); chunk.Str().clear(); } else if (chunk.GetType() == CT_NL_CONT) { last_was_tab = chunk.GetAfterTab(); chunk.SetAfterTab(false); chunk.Str() = "\\\n"; } else { chunk.SetAfterTab(last_was_tab); last_was_tab = false; } num_stripped = 0; // Issue #1966 and #3565 if (chunk.GetType() != CT_IGNORED) { // Issue #1338 // Strip trailing whitespace (for CPP comments and PP blocks) while ( (chunk.GetStr().size() > 0) && ( (chunk.GetStr()[chunk.GetStr().size() - 1] == ' ') || (chunk.GetStr()[chunk.GetStr().size() - 1] == '\t'))) { // If comment contains backslash '\' followed by whitespace chars, keep last one; // this will prevent it from turning '\' into line continuation. if ( (chunk.GetStr().size() > 1) && (chunk.GetStr()[chunk.GetStr().size() - 2] == '\\')) { break; } chunk.Str().pop_back(); num_stripped++; // Issue #1966 } } // Store off the end column chunk.SetOrigColEnd(ctx.c.col - num_stripped); // Issue #1966 and #3565 // Make the whitespace we disposed of be attributed to the next chunk prev_sp = num_stripped; // Add the chunk to the list rprev = pc; if (rprev != nullptr) { pc->SetFlagBits(rprev->GetFlags() & PCF_COPY_FLAGS); // a newline can't be in a preprocessor if (pc->Is(CT_NEWLINE)) { pc->ResetFlagBits(PCF_IN_PREPROC); } } if (ref != nullptr) { chunk.SetFlagBits(PCF_INSERTED); } else { chunk.ResetFlagBits(PCF_INSERTED); } pc = chunk.CopyAndAddBefore(ref); // A newline marks the end of a preprocessor if (pc->Is(CT_NEWLINE)) // || pc->Is(CT_COMMENT_MULTI)) { cpd.in_preproc = CT_NONE; cpd.preproc_ncnl_count = 0; } // Disable indentation when #asm directive found if (pc->Is(CT_PP_ASM)) { LOG_FMT(LBCTRL, "Found a directive %s on line %zu\n", "#asm", pc->GetOrigLine()); cpd.unc_off = true; } // Special handling for preprocessor stuff if (cpd.in_preproc != CT_NONE) { pc->SetFlagBits(PCF_IN_PREPROC); // Count words after the preprocessor if (!pc->IsCommentOrNewline()) { cpd.preproc_ncnl_count++; } // Disable indentation if a #pragma asm directive is found if (cpd.in_preproc == CT_PP_PRAGMA) { if (memcmp(pc->Text(), "asm", 3) == 0) { LOG_FMT(LBCTRL, "Found a pragma %s on line %zu\n", "asm", pc->GetOrigLine()); cpd.unc_off = true; } } // Figure out the type of preprocessor for #include parsing if (cpd.in_preproc == CT_PREPROC) { if ( pc->GetType() < CT_PP_DEFINE || pc->GetType() > CT_PP_OTHER) { pc->SetType(CT_PP_OTHER); } cpd.in_preproc = pc->GetType(); } else if (cpd.in_preproc == CT_PP_IGNORE) { if ( !pc->Is(CT_NL_CONT) && !pc->IsComment()) // Issue #1966 { pc->SetType(CT_PP_IGNORE); } } else if ( cpd.in_preproc == CT_PP_DEFINE && pc->Is(CT_PAREN_CLOSE) && options::pp_ignore_define_body()) { log_rule_B("pp_ignore_define_body"); // When we have a PAREN_CLOSE in a PP_DEFINE we should be terminating a MACRO_FUNC // arguments list. Therefore we can enter the PP_IGNORE state and ignore next chunks. cpd.in_preproc = CT_PP_IGNORE; } } else { // Check for a preprocessor start if ( pc->Is(CT_POUND) && ( rprev == nullptr || rprev->Is(CT_NEWLINE))) { pc->SetType(CT_PREPROC); pc->SetFlagBits(PCF_IN_PREPROC); cpd.in_preproc = CT_PREPROC; } } if (pc->Is(CT_NEWLINE)) { LOG_FMT(LGUY, "%s(%d): orig line is %zu, orig col is %zu, , nl is %zu\n", __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(), pc->GetNlCount()); } else if (pc->Is(CT_VBRACE_OPEN)) { LOG_FMT(LGUY, "%s(%d): orig line is %zu, orig col is %zu, type is %s, orig col end is %zu\n", __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(), get_token_name(pc->GetType()), pc->GetOrigColEnd()); } else { char copy[1000]; LOG_FMT(LGUY, "%s(%d): orig line is %zu, orig col is %zu, Text() '%s', type is %s, orig col end is %zu\n", __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(), pc->ElidedText(copy), get_token_name(pc->GetType()), pc->GetOrigColEnd()); } } // Set the cpd.newline string for this file log_rule_B("newlines"); if ( options::newlines() == LE_LF || ( options::newlines() == LE_AUTO && (LE_COUNT(LF) >= LE_COUNT(CRLF)) && (LE_COUNT(LF) >= LE_COUNT(CR)))) { // LF line ends cpd.newline = "\n"; LOG_FMT(LLINEENDS, "Using LF line endings\n"); } else if ( options::newlines() == LE_CRLF || ( options::newlines() == LE_AUTO && (LE_COUNT(CRLF) >= LE_COUNT(LF)) && (LE_COUNT(CRLF) >= LE_COUNT(CR)))) { // CRLF line ends cpd.newline = "\r\n"; LOG_FMT(LLINEENDS, "Using CRLF line endings\r\n"); } else { // CR line ends cpd.newline = "\r"; LOG_FMT(LLINEENDS, "Using CR line endings\n"); } } // tokenize