#!/usr/bin/env python
#
# Lexer for the IMP language (grammar in impTokens.py)
#

import ply.lex as lex

''' 
IMP Grammar:

program ::= stmt_list EOF

number ::= the domain of (unbounded) integer numbers, with usual operations on them

identifier  ::= standard identifiers

binaryop ::= + | - | * | / | && | <= | >=

unaryop ::= ! 

expression ::= number
    | identifier
    | expression binaryop expression
    | unaryop expression

stmt_list ::= 
    | stmt
    | stmt_list stmt

block ::= { stmt_list }

assignstmt ::= identifier = expression;
whilestmt ::= while (expression) block
ifstmt ::= if (expression) block else block

stmt ::= assignstmt
    | ifstmt
    | whilestmt

'''


          
''' 
A Ply-based (http://www.dabeaz.com/ply/ply.html) implementation
of a lexer for the IMP language.
'''

# Keywords of the language. Each entry is also a token name; the lower-cased
# spelling is what appears in IMP source text.
reserved = ('IF', 'ELSE', 'WHILE')

# Complete list of token names produced by this lexer (PLY requires the
# module-level name `tokens`).
tokens = (
    reserved
    # literals: identifier, integer constant
    + ('ID', 'ICONST')
    # arithmetic operators: +  /  -  *
    + ('PLUS', 'DIVIDE', 'MINUS', 'TIMES')
    # logical operators: &&  !
    + ('AND', 'NOT')
    # comparison operators: <=  >=  ==
    + ('LE', 'GE', 'EQ')
    # remainder operator: %
    + ('MOD',)
    # assignment: =
    + ('EQUALS',)
    # delimiters: ( ) { } ;
    + ('LPAREN', 'RPAREN', 'LBRACE', 'RBRACE', 'SEMI')
)
# Simple token rules: each `t_NAME` string is the regular expression for the
# token NAME. Per the PLY documentation, string rules are tried in order of
# decreasing regex length, so '==' is attempted before '='.

# arithmetic, logical and comparison operators
t_PLUS = r'\+'
t_DIVIDE = r'/'
t_MINUS = r'-'
t_TIMES = r'\*'
t_AND = r'&&'
t_NOT = r'!'
t_LE = r'<='
t_GE = r'>='
t_EQ = r'=='
t_MOD = r'%'

# assignment operator
t_EQUALS = r'='

# delimiters
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_LBRACE = r'\{'
t_RBRACE = r'\}'
t_SEMI = r';'

# Characters skipped between tokens: space and tab. Newlines are handled
# separately so that line numbers can be tracked.
t_ignore = ' \t'

# Lookup table from a keyword's source spelling (lower case, e.g. 'while')
# to its token name (e.g. 'WHILE'). Built with a comprehension so that no
# loop variable is left behind in the module namespace.
reserved_map = {name.lower(): name for name in reserved}

# identifiers
def t_ID(t):
    r'[A-Za-z_]([_\.\w]*[_\w]+)*'
    # NOTE: the string above is the token's regular expression (PLY reads it
    # from the function docstring), so documentation lives in comments.
    # The pattern accepts a letter or underscore followed by word characters
    # and dots, where the final character may not be a dot.
    #
    # Keywords match the same pattern as identifiers, so re-type the token
    # when its text is a reserved word; otherwise it stays a plain 'ID'.
    keyword = reserved_map.get(t.value)
    if keyword is None:
        t.type = 'ID'
    else:
        t.type = keyword
    return t

# integer literal
# One or more decimal digits. This is a plain string rule with no action
# function, so nothing here converts the matched text to an int.
t_ICONST = r'\d+'

# newlines, count them so we know what line we are on
def t_newline(t):
    r'[\r\n]+'
    # NOTE: the string above is the token's regular expression (PLY reads it
    # from the function docstring), so documentation lives in comments.
    #
    # Advance the lexer's line counter by the number of line breaks matched.
    # A Windows line ending is the two-character sequence '\r\n' and must
    # count as ONE line, so collapse each pair to a single '\n' before
    # counting. A lone '\r' or '\n' still counts as one line each.
    # Nothing is returned, so no token is emitted for newlines.
    t.lexer.lineno += len(t.value.replace('\r\n', '\n'))
    
# Compute column. 
#     input is the input text string
#     token is a token instance
def find_column(data, token):
    """Return the 1-based column of *token* within its line of *data*.

    data  -- the complete input text that was given to the lexer
    token -- a token instance; only its ``lexpos`` attribute (offset of the
             token's first character in *data*) is read

    The column is counted in characters from the start of the line, where a
    line starts just after the preceding '\\n' (or at offset 0). Tabs count
    as a single character.
    """
    # rfind returns -1 when there is no earlier newline, so adding 1 yields
    # offset 0 for the first line and "one past the newline" for later
    # lines. This keeps the numbering 1-based on every line.
    line_start = data.rfind('\n', 0, token.lexpos) + 1
    return (token.lexpos - line_start) + 1
    
# syntactical error
def t_error(t):
    # PLY error hook: called when no token rule matches at the current
    # position. `t.value` holds the remaining unmatched input, so
    # `t.value[0]` is the offending character.
    #
    # Report "line,column" plus the offending character, then skip exactly
    # one character so that lexing resumes right after it.
    #
    # The input text is taken from `t.lexer` (the lexer that raised the
    # error) rather than from a module-level lexer object, so the column is
    # computed against the right input even for a cloned or separately
    # built lexer.
    #
    # print(...) with a single pre-formatted string produces the same output
    # under both the Python 2 print statement and the Python 3 function.
    column = find_column(t.lexer.lexdata, t)
    print('IMP lexer: %s,%s: syntax error: "%s"' % (t.lineno, column, t.value[0]))
    t.lexer.skip(1)
    
# Testing
# Build the lexer from the rules in this module. This runs at import time, so
# the module-level name `lexer` is available to code that imports this module.
lexer = lex.lex(optimize=0)

if __name__ == '__main__':
    # Executed as a script: hand the lexer to PLY's built-in test driver.
    lex.runmain(lexer)
