diff options
| author | Michele Calgaro <michele.calgaro@yahoo.it> | 2023-11-18 17:53:35 +0900 | 
|---|---|---|
| committer | Michele Calgaro <michele.calgaro@yahoo.it> | 2023-11-19 19:27:29 +0900 | 
| commit | c0a6f1b84c84749908961579b84513fd9f9d9eac (patch) | |
| tree | ace7ba60cb031acd3a1f4ff10f7bbc5668fa801f /debian/uncrustify-trinity/uncrustify-trinity-0.78.0/scripts/tokenizer.py | |
| parent | 52e5ffe140f0f4402e97936447bc9a606045d2b5 (diff) | |
| download | extra-dependencies-c0a6f1b84c84749908961579b84513fd9f9d9eac.tar.gz extra-dependencies-c0a6f1b84c84749908961579b84513fd9f9d9eac.zip | |
uncrustify-trinity: updated based on upstream version 0.78.0
Signed-off-by: Michele Calgaro <michele.calgaro@yahoo.it>
Diffstat (limited to 'debian/uncrustify-trinity/uncrustify-trinity-0.78.0/scripts/tokenizer.py')
| -rwxr-xr-x | debian/uncrustify-trinity/uncrustify-trinity-0.78.0/scripts/tokenizer.py | 316 | 
1 file changed, 316 insertions, 0 deletions
# Punctuator lookup table.
#
# Each row is [match_char, alternatives_left, next_row, completed_token]:
#   match_char        - the input character this row matches
#   alternatives_left - rows remaining after this one in the same
#                       alternatives group (0 = last alternative, give up)
#   next_row          - row index to continue matching at (0 = stop here)
#   completed_token   - punctuator completed by this match, or None if the
#                       match is only a prefix of longer punctuators
punc_table = [
    ['!', 25, 26, '!'],        #  0: '!'
    ['#', 24, 35, '#'],        #  1: '#'
    ['$', 23, 0, '$'],         #  2: '$'
    ['%', 22, 36, '%'],        #  3: '%'
    ['&', 21, 41, '&'],        #  4: '&'
    ['(', 20, 0, '('],         #  5: '('
    [')', 19, 0, ')'],         #  6: ')'
    ['*', 18, 43, '*'],        #  7: '*'
    ['+', 17, 44, '+'],        #  8: '+'
    [',', 16, 0, ','],         #  9: ','
    ['-', 15, 46, '-'],        # 10: '-'
    ['.', 14, 50, '.'],        # 11: '.'
    ['/', 13, 53, '/'],        # 12: '/'
    [':', 12, 54, ':'],        # 13: ':'
    [';', 11, 0, ';'],         # 14: ';'
    ['<', 10, 56, '<'],        # 15: '<'
    ['=', 9, 63, '='],         # 16: '='
    ['>', 8, 65, '>'],         # 17: '>'
    ['?', 7, 0, '?'],          # 18: '?'
    ['[', 6, 70, '['],         # 19: '['
    [']', 5, 0, ']'],          # 20: ']'
    ['^', 4, 71, '^'],         # 21: '^'
    ['{', 3, 0, '{'],          # 22: '{'
    ['|', 2, 72, '|'],         # 23: '|'
    ['}', 1, 0, '}'],          # 24: '}'
    ['~', 0, 74, '~'],         # 25: '~'
    ['<', 3, 30, '!<'],        # 26: '!<'
    ['=', 2, 33, '!='],        # 27: '!='
    ['>', 1, 34, '!>'],        # 28: '!>'
    ['~', 0, 0, '!~'],         # 29: '!~'
    ['=', 1, 0, '!<='],        # 30: '!<='
    ['>', 0, 32, '!<>'],       # 31: '!<>'
    ['=', 0, 0, '!<>='],       # 32: '!<>='
    ['=', 0, 0, '!=='],        # 33: '!=='
    ['=', 0, 0, '!>='],        # 34: '!>='
    ['#', 0, 0, '##'],         # 35: '##'
    [':', 2, 39, '%:'],        # 36: '%:'
    ['=', 1, 0, '%='],         # 37: '%='
    ['>', 0, 0, '%>'],         # 38: '%>'
    ['%', 0, 40, None],        # 39: '%:%' (prefix of '%:%:' only)
    [':', 0, 0, '%:%:'],       # 40: '%:%:'
    ['&', 1, 0, '&&'],         # 41: '&&'
    ['=', 0, 0, '&='],         # 42: '&='
    ['=', 0, 0, '*='],         # 43: '*='
    ['+', 1, 0, '++'],         # 44: '++'
    ['=', 0, 0, '+='],         # 45: '+='
    ['-', 2, 0, '--'],         # 46: '--'
    ['=', 1, 0, '-='],         # 47: '-='
    ['>', 0, 49, '->'],        # 48: '->'
    ['*', 0, 0, '->*'],        # 49: '->*'
    ['*', 1, 0, '.*'],         # 50: '.*'
    ['.', 0, 52, '..'],        # 51: '..'
    ['.', 0, 0, '...'],        # 52: '...'
    ['=', 0, 0, '/='],         # 53: '/='
    [':', 1, 0, '::'],         # 54: '::'
    ['>', 0, 0, ':>'],         # 55: ':>'
    ['%', 4, 0, '<%'],         # 56: '<%'
    [':', 3, 0, '<:'],         # 57: '<:'
    ['<', 2, 61, '<<'],        # 58: '<<'
    ['=', 1, 0, '<='],         # 59: '<='
    ['>', 0, 62, '<>'],        # 60: '<>'
    ['=', 0, 0, '<<='],        # 61: '<<='
    ['=', 0, 0, '<>='],        # 62: '<>='
    ['=', 0, 64, '=='],        # 63: '=='
    ['=', 0, 0, '==='],        # 64: '==='
    ['=', 1, 0, '>='],         # 65: '>='
    ['>', 0, 67, '>>'],        # 66: '>>'
    ['=', 1, 0, '>>='],        # 67: '>>='
    ['>', 0, 69, '>>>'],       # 68: '>>>'
    ['=', 0, 0, '>>>='],       # 69: '>>>='
    [']', 0, 0, '[]'],         # 70: '[]'
    ['=', 0, 0, '^='],         # 71: '^='
    ['=', 1, 0, '|='],         # 72: '|='
    ['|', 0, 0, '||'],         # 73: '||'
    ['=', 1, 0, '~='],         # 74: '~='
    ['~', 0, 0, '~~'],         # 75: '~~'
]
#
# Token types:
#  0 = newline
#  1 = punctuator
#  2 = integer
#  3 = float
#  4 = string
#  5 = identifier
#
class Tokenizer:
    """Lexer for C/C++/C#/D/Java/Pawn-style source text.

    tokenize_text() fills self.tokens with (text, type) tuples, where
    type is: 0 newline, 1 punctuator, 2 integer, 3 float, 4 string,
    5 identifier.
    """

    def __init__(self):
        self.tokens = []    # list of (token_text, token_type) tuples
        self.text = ''      # text currently being tokenized
        self.text_idx = 0   # scan position within self.text

    def tokenize_text(self, in_text):
        """Tokenize in_text into self.tokens (resets any prior state).

        On an unrecognized character, prints a diagnostic and stops.
        Exceptions are logged ("bombed") and re-raised.
        """
        self.tokens = []
        self.text = in_text
        self.text_idx = 0

        print(in_text)
        try:
            while self.text_idx < len(self.text):
                if self.parse_whitespace():
                    continue
                elif (self.text[self.text_idx] == '\\'
                        and self.text_idx + 1 < len(self.text)
                        and self.text[self.text_idx + 1] == '\n'):
                    # Backslash-newline line continuation: skip both chars.
                    # (Bounds check added: a lone trailing '\' no longer
                    # raises IndexError.)
                    self.text_idx += 2
                    continue
                elif self.parse_comment():
                    continue
                elif self.parse_number():
                    continue
                elif self.parse_identifier():
                    continue
                elif self.parse_string():
                    continue
                elif self.parse_punctuator():
                    continue
                else:
                    print("confused: %s" % self.text[self.text_idx:])
                    break
        except Exception:
            print("bombed")
            raise

    def parse_whitespace(self):
        """Consume a run of spaces, tabs and newlines.

        Emits a single ('\\n', 0) token if the run contained any newline.
        Returns True if anything was consumed.
        """
        start_idx = self.text_idx
        hit_newline = False
        while self.text_idx < len(self.text):
            if self.text[self.text_idx] in '\n\r':
                hit_newline = True
            elif not self.text[self.text_idx] in ' \t':
                break
            self.text_idx += 1

        if hit_newline:
            self.tokens.append(('\n', 0))
        return start_idx != self.text_idx

    def parse_comment(self):
        """Consume a // line comment or /* */ block comment (no token).

        A line comment stops at (but does not consume) the newline.
        Returns True if a comment was consumed.
        """
        # Bounds check added: a '/' as the very last character is not a
        # comment (the original indexed one past the end here).
        if (self.text[self.text_idx] != '/'
                or self.text_idx + 1 >= len(self.text)
                or self.text[self.text_idx + 1] not in '/*'):
            return False
        if self.text[self.text_idx + 1] == '/':
            # Line comment: leave the newline for parse_whitespace().
            while self.text_idx < len(self.text):
                if self.text[self.text_idx] in '\n\r':
                    break
                self.text_idx += 1
        else:
            # Block comment: skip past the closing '*/'.  An unterminated
            # comment stops one character short of the end (as before).
            while self.text_idx < len(self.text) - 1:
                if self.text[self.text_idx] == '*' and self.text[self.text_idx + 1] == '/':
                    self.text_idx += 2
                    break
                self.text_idx += 1
        return True

    def parse_identifier(self):
        """Consume an identifier: [@_A-Za-z] then [@_A-Za-z0-9]*."""
        if not self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            return False
        start_idx = self.text_idx
        while self.text_idx < len(self.text) and \
                self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890':
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], 5))
        return True

    def parse_string(self):
        """Consume a single- or double-quoted string, honoring \\ escapes.

        Supports an optional leading L (wide-literal) prefix.  NOTE: when
        called via tokenize_text(), a leading L is normally consumed first
        by parse_identifier(), so the prefix path matters only for direct
        callers.  An unterminated string runs to end of text.
        """
        starter = 0
        start_ch = self.text[self.text_idx]
        if start_ch == 'L':
            # Bounds check added: a lone trailing 'L' cannot start a string.
            if self.text_idx + 1 >= len(self.text):
                return False
            starter = 1
            start_ch = self.text[self.text_idx + 1]
        if not start_ch in '"\'':
            return False
        start_idx = self.text_idx
        self.text_idx += starter + 1
        escaped = False
        while self.text_idx < len(self.text):
            if escaped:
                escaped = False
            else:
                if self.text[self.text_idx] == '\\':
                    escaped = True
                elif self.text[self.text_idx] == start_ch:
                    self.text_idx += 1
                    break
            self.text_idx += 1

        self.tokens.append((self.text[start_idx : self.text_idx], 4))
        return True

    # Checks for punctuators
    # Returns whether a punctuator was consumed (True or False)
    def parse_punctuator(self):
        """Consume the longest punctuator matched by punc_table.

        Fixes vs. the original: (1) the table walk no longer indexes past
        the end of the text (input ending in e.g. '==' used to raise
        IndexError); (2) when a longer match fails partway, text_idx is
        rewound to the end of the longest *completed* punctuator, so
        characters consumed by the failed attempt are no longer dropped
        (e.g. '%:%x' used to lose the second '%').
        """
        tab_idx = 0
        saved_punc = None
        saved_end = self.text_idx   # end position of the longest full match
        while 1:
            pte = punc_table[tab_idx]
            if (self.text_idx < len(self.text)
                    and pte[0] == self.text[self.text_idx]):
                self.text_idx += 1
                if pte[3] is not None:
                    # A complete punctuator: remember it and where it ends
                    # so a failed longer match can fall back to it.
                    saved_punc = pte[3]
                    saved_end = self.text_idx
                tab_idx = pte[2]
                if tab_idx == 0:
                    break
            elif pte[1] == 0:
                break
            else:
                tab_idx += 1
        if saved_punc is not None:
            self.text_idx = saved_end   # roll back any failed longer match
            self.tokens.append((saved_punc, 1))
            return True
        return False

    def _consume_while(self, chars):
        """Advance text_idx over consecutive characters from chars (bounds-safe)."""
        while (self.text_idx < len(self.text)
                and self.text[self.text_idx] in chars):
            self.text_idx += 1

    def parse_number(self):
        """Consume an integer (type 2) or float (type 3) literal.

        Fixes vs. the original: (1) the octal test compared a one-char
        string against the int 7 ('ch <= 7'), which raises TypeError in
        Python 3 — now '0' <= ch <= '7'; (2) every scan loop and lookahead
        is bounds-checked, so a number at end of text no longer raises
        IndexError.
        """
        # A number must start with a digit or a dot followed by a digit.
        ch = self.text[self.text_idx]
        if not ch.isdigit():
            if (ch != '.' or self.text_idx + 1 >= len(self.text)
                    or not self.text[self.text_idx + 1].isdigit()):
                return False
        token_type = 2   # integer
        if ch == '.':
            token_type = 3   # float
        did_hex = False
        start_idx = self.text_idx

        # Check for Hex, Octal, or Binary prefixes.
        # Note that only D and Pawn support binary, but who cares?
        if ch == '0':
            self.text_idx += 1
            nxt = (self.text[self.text_idx].upper()
                   if self.text_idx < len(self.text) else '')
            if nxt == 'X':                # hex
                did_hex = True
                self.text_idx += 1
                self._consume_while('_0123456789abcdefABCDEF')
            elif nxt == 'B':              # binary
                self.text_idx += 1
                self._consume_while('_01')
            elif '0' <= nxt <= '7':       # octal (but allow decimal)
                self.text_idx += 1
                self._consume_while('_0123456789')
            else:
                # either just 0 or 0.1 or 0UL, etc.
                pass
        else:
            # Regular int or float.
            self._consume_while('_0123456789')

        # Check if we stopped on a decimal point.
        if self.text_idx < len(self.text) and self.text[self.text_idx] == '.':
            self.text_idx += 1
            token_type = 3   # float
            if did_hex:
                self._consume_while('_0123456789abcdefABCDEF')
            else:
                self._consume_while('_0123456789')

        # Check exponent.
        # Valid exponents per language (not that it matters):
        # C/C++/D/Java: eEpP
        # C#/Pawn:      eE
        if (self.text_idx < len(self.text)
                and self.text[self.text_idx] in 'eEpP'):
            token_type = 3   # float
            self.text_idx += 1
            if (self.text_idx < len(self.text)
                    and self.text[self.text_idx] in '+-'):
                self.text_idx += 1
            self._consume_while('_0123456789')

        # Check the suffixes.
        # Valid suffixes per language (not that it matters):
        #        Integer       Float
        # C/C++: uUlL          lLfF
        # C#:    uUlL          fFdDMm
        # D:     uUL           ifFL
        # Java:  lL            fFdD
        # Pawn:  (none)        (none)
        while self.text_idx < len(self.text):
            if self.text[self.text_idx] in 'tTfFdDmM':
                token_type = 3   # float
            elif self.text[self.text_idx] not in 'lLuU':
                break
            self.text_idx += 1

        self.tokens.append((self.text[start_idx : self.text_idx], token_type))
        return True
# Demo: tokenize a small sample that exercises numbers, strings, comments,
# line continuations and punctuators, then dump the resulting token list.
sample = """
1.23+4-3*16%2 *sin(1.e-3 + .5p32) "hello" and "hello\\"there"
123 // some comment
a = b + c;
#define abc \\
        5
d = 5 /* hello */ + 3;
"""

tok = Tokenizer()
tok.tokenize_text(sample)
print(tok.tokens)
