(include "lib/core.scm")
(include "lib/frb.scm")
(include "lib/string.scm")
(include "lib/io.scm")
(include "lib/pair.scm")
(include "lib/vector.scm")
(include "lib/symbol.scm")
(define (lex producer consumer)
(let ((current '())
(final #f)
(last-final #f)
)
(define (build-token final)
(consumer (cons (car final) (list->string (reverse current))))
(set! current '())
(set! last-final #f))
;; defines the <step> function (DFA) from the lexer generator
(include "tests/step5.scm")
(let block-loop ((s (producer))
(state 0)
(slen 0)
(ch #f)
)
(set! slen (string-length s))
(print-string "got bytes in block-loop: ") (%printn slen)
(if (= slen 0)
(if last-final (build-token last-final) #f)
(let char-loop ((i 0))
(cond ((= i slen)
(block-loop (producer) state 0 #f))
(else
(set! ch (string-ref s i))
(set! state (step ch state))
(set! final (vector-ref finals state))
(cond ((and final (cdr final))
;; single-char final state
(build-token final)
(set! state 0)
(char-loop (+ i 1)))
((and last-final (not final))
;; multiple-char, must wait until transition out...
(build-token last-final)
(set! state 0)
(char-loop i))
(else
(set! last-final final)
(set! current (cons ch current))
(char-loop (+ i 1))))))
)))
))
(define end-of-file '(end-of-file . end-of-file))
(define (make-lex-generator filename)
(make-generator
(lambda (consumer)
(lex ((buffered-file (open filename 0) 2048) 'read-buffer) consumer)
(let forever () (consumer end-of-file) (forever))
)))
(define (string-generator s)
(lambda (consumer)
(consumer s)
(let forever () (consumer end-of-file) (forever))
))
(define (parse token-gen)
(let* ((current-token #f)
(start-of-line #t)
(saved-data #f)
)
(define (syntax-error expected)
(print-string "syntax error, expected <")
(%print expected)
(print-string "> but got ")
(%print current-token)
(print-string "\n")
(error "parse failed"))
(define (token.class tok) (car tok))
(define (token.data tok) (cdr tok))
(define (get-indent tok)
(cond ((eq? (token.class tok) 'whitespace)
;; XXX handle or disallow tabs
(string-length (token.data tok)))
;; non-whitespace at the front of a line
(else 0)))
(define next-token
;; process (i.e., filter/synthesize) the token stream
(let ((paren-stack '())
(indentation 0)
(start-of-line #t)
(held-token #f)
(tok #f)
)
(lambda ()
(let loop ()
;; I'm *not* happy with this. I feel like the whole
;; indent/dedent thing can be done more cleanly with
;; either another wrapper function or a generator...
(cond (held-token
(set! tok held-token)
(set! held-token #f))
(else
(set! tok (token-gen))))
(print-string " next-token loop: held-token= ") (%printn held-token)
(if start-of-line
;; in this state we might emit INDENT/DEDENT
(let ((this-indent (get-indent tok)))
(set! start-of-line #f)
(set! held-token tok)
(cond ((> this-indent indentation)
(set! indentation this-indent)
(cons 'indent 'indent))
((< this-indent indentation)
(set! indentation this-indent)
(cons 'dedent 'dedent))
(else
(loop))))
(case (token.class tok)
((newline)
(cond ((null? paren-stack)
(set! start-of-line #t)
(cons 'newline 'newline))
(else (loop))))
((whitespace comment) (loop))
((keyword)
;; convert keywords into their own 'class'
;; XXX downcase!
(let ((keyword (string->symbol (token.data tok))))
(cons keyword keyword)))
(else tok)))))))
(define (token) current-token)
(define (next)
(print-string "calling next-token\n")
(set! current-token (next-token))
(print-string " next-token returned ") (%printn current-token)
(set! start-of-line (eq? (token.class current-token) 'newline))
)
(define (match class)
(eq? class (token.class current-token)))
(define (match-string/drop class data)
(cond ((and (match class)
(string-=? data (token.data current-token)))
(next)
#t)
(else #f)))
(define (match/drop class)
(cond ((match class)
(next)
#t)
(else #f)))
(define (match/save class)
(cond ((match class)
(set! saved-data (token.data current-token))
(next)
#t)
(else #f)))
(define (expect class yes-proc error?)
(if (match/drop class)
(yes-proc)
(if error?
(syntax-error class)
#f)))
(define (expect2 c1 c2 yes-proc error?)
(if (match/drop c1)
(expect c2 yes-proc #t)
(if error?
(syntax-error c1)
#f)))
(define (expect/data class error?)
(cond ((match class)
(let ((data (token.data current-token)))
(next)
data))
(error?
(syntax-error class))
(else #f)))
;; >>> taking inspiration from Pyrex's recursive-descent parser <<<
;; atom: ('(' [yield_expr|testlist_gexp] ')' | '[' [listmaker] ']' | '{' [dictmaker] '}' | '`' testlist1 '`' | NAME | NUMBER | STRING+)
(define (p-atom)
(let ((tok current-token))
(case (token.class tok)
((ident)
(next)
#(varref ,(token.data tok)))
((number)
(next)
#(int ,(token.data tok)))
((string1 string2)
;; this should really be STRING+
(next)
#(string ,(token.data tok)))
((lbrace)
(next)
#(list ,(p-listmaker)))
((lbracket)
(next)
#(dict ,(p-dictmaker)))
(else
(syntax-error 'atom)))))
;; listmaker: test ( list_for | (',' test)* [','] )
;; XXX not doing <list_for> for now
(define (p-listmaker)
(let loop ((list (cons (p-test) '())))
(if (match/drop 'rbrace)
list
(if (match/drop 'comma)
(loop (cons (p-test) list))
(syntax-error 'comma)))))
;;dictmaker: test ':' test (',' test ':' test)* [',']
(define (p-dictpair)
(let ((n1 (p-test)))
(if (match/drop 'colon)
(cons n1 (p-test))
(syntax-error 'colon))))
(define (p-dictmaker)
(let loop ((list (cons (p-dictpair) '())))
(if (match/drop 'rbracket)
list
(if (match/drop 'comma)
(loop (cons (p-dictpair) list))
(syntax-error 'comma)))))
;; trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
(define (p-trailer atom)
(let loop ((trailers (cons atom '())))
(cond ((match/drop 'lparen)
(loop (cons #(call ,(p-arglist)) trailers)))
((match/drop 'lbrace)
(loop (cons (p-subscript) trailers)))
((match/drop 'getattr)
(if (match 'ident)
(let ((name (token.data current-token)))
(next)
(loop (cons #(getattr ,name) trailers)))
(syntax-error 'ident)))
(else trailers))))
;; power: atom trailer* ['**' factor]
(define (p-power)
(let* ((atom (p-atom)))
(case (token.class current-token)
((lparen lbrace getattr)
(p-trailer atom))
((power)
(next)
#(binop power ,atom ,(p-factor)))
(else
atom))))
;; factor: ('+'|'-'|'~') factor | power
(define (p-factor)
(cond ((match-string/drop 'addop "+")
(p-factor))
((match-string/drop 'addop "-")
#(unary-minus ,(p-factor)))
((match/drop 'bitnot)
#(bitnot ,(p-factor)))
(else
(p-power))))
;; helper for binary operators
(define (p-binop class p)
(let loop ((n1 (p))
(tok current-token))
(cond ((match/drop class)
(loop #(binop ,(token.data tok) ,n1 ,(p)) current-token))
(else n1))))
;; term: factor (('*'|'/'|'%'|'//') factor)*
(define (p-term)
(p-binop 'mulop p-factor))
;; arith_expr: term (('+'|'-') term)*
(define (p-arith)
(p-binop 'addop p-term))
;; shift_expr: arith_expr (('<<'|'>>') arith_expr)*
(define (p-shift)
(p-binop 'shift p-arith))
;; and_expr: shift_expr ('&' shift_expr)*
(define (p-and)
(p-binop 'bitand p-shift))
;; xor_expr: and_expr ('^' and_expr)*
(define (p-xor)
(p-binop 'bitxor p-and))
;; expr: xor_expr ('|' xor_expr)*
(define (p-expr)
(p-binop 'bitor p-xor))
;; comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
;; comparison: expr (comp_op expr)*
(define (p-compare)
(p-binop 'compare p-expr))
(define (p-is)
(let ((n1 (p-compare)))
(cond ((match/drop 'is)
(cond ((match/drop 'not)
#(binop "is-not" ,n1 ,(p-compare)))
(else
#(binop "is" ,n1 ,(p-compare)))))
(else n1))))
(define (p-in)
(let ((n1 (p-is)))
(cond ((match/drop 'in)
#(binop "in" ,n1 ,(p-is)))
((match/drop 'not)
(cond ((match/drop 'in)
#(binop "not-in" ,n1 ,(p-is)))
(else
#(binop "in" ,n1 ,(p-is)))))
(else n1))))
;; not_test: 'not' not_test | comparison
(define (p-not-test)
(if (match/drop 'not)
#(not ,(p-not-test))
(p-in)))
;; and_test: not_test ('and' not_test)*
(define (p-and-test)
(let ((n1 (p-not-test)))
(cond ((match/drop 'and)
#(and ,n1 ,(p-not-test)))
(else n1))))
;; or_test: and_test ('or' and_test)*
(define (p-or-test)
(let ((n1 (p-and-test)))
(cond ((match/drop 'or)
#(or ,n1 ,(p-and-test)))
(else n1))))
;; test: or_test ['if' or_test 'else' test] | lambdef
(define (p-test)
(if (match/drop 'lambda)
(p-lambdef)
(let ((n1 (p-or-test)))
(if (match/drop 'if)
(let ((n2 (p-or-test)))
(if (match/drop 'else)
#(test ,n1 ,n2 ,(p-test))
(syntax-error 'else)))
n1))))
;; gonna fake this for now, maybe get rid of it
(define (p-old-test)
(p-test))
;; lambdef: 'lambda' [varargslist] ':' test
(define (p-lambdef)
(error "not yet"))
;; listmaker: test ( list_for | (',' test)* [','] )
;; exprlist: expr (',' expr)* [',']
;; XXX This is nasty - how do we know when the list is over?
;; if you look at the grammar, exprlist is followed by 'in'
;; in most places. delexpr is the exception. figure this out.
(define (p-exprlist)
(let loop ((exprs '()))
(let ((expr (p-expr)))
(%printn expr)
(cond ((match/drop 'comma)
(loop (cons expr exprs)))
((eq? (token.class current-token) 'in)
;; we're done
(cons expr exprs))
(else
(syntax-error 'exprlist))))))
;; gen_iter: gen_for | gen_if
;; gen_for: 'for' exprlist 'in' or_test [gen_iter]
;; gen_if: 'if' old_test [gen_iter]
;; these three p-gen-xxx funs act more like probes than normal
;; parsing functions - they will return #f rather than raising
;; a syntax error...
(define (p-gen-for)
(if (match/drop 'for)
(let ((exprlist (p-exprlist)))
(if (match/drop 'in)
(let* ((or-test (p-or-test))
(gen-iter-probe (p-gen-iter)))
(if gen-iter-probe
#(gen-for ,exprlist ,or-test ,gen-iter-probe)
#(gen-for ,exprlist ,or-test)))
(syntax-error 'in)))
#f))
(define (p-gen-if)
(if (match/drop 'if)
(let ((old-test (p-old-test))
(gen-iter-probe (p-gen-iter)))
(if gen-iter-probe
#(gen-if ,old-test ,gen-iter-probe)
#(gen-if ,old-test)))
#f))
(define (p-gen-iter)
(case (token.class current-token)
((for) (p-gen-for))
((if) (p-gen-if))
(else #f)))
;; list_iter: list_for | list_if
;; list_for: 'for' exprlist 'in' testlist_safe [list_iter]
;; list_if: 'if' old_test [list_iter]
;; argument: test [gen_for] | test '=' test # Really [keyword '='] test
(define (p-argument)
(let ((n1 (p-test)))
(cond ((match/drop 'assign)
#(keyword ,n1 ,(p-test)))
(else
(let ((gen-for-probe (p-gen-for)))
(if gen-for-probe
#(gen ,n1)
n1))))))
;; arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test)
;; Note: we'll verify correct ordering outside of the parser...
(define (p-arglist)
(let loop ((args '()))
(cond ((match/drop 'rparen)
args)
((match-string/drop 'mulop "*")
(loop (cons #(restargs ,(p-argument)) args)))
((match/drop 'power)
(loop (cons #(keyargs ,(p-argument)) args)))
((match/drop 'comma)
(loop args))
(else
(loop (cons (p-argument) args))))))
;; actual Python grammar
;; -----------------------
;; subscriptlist: subscript (',' subscript)* [',']
;; subscript: '.' '.' '.' | test | [test] ':' [test] [sliceop]
;; sliceop: ':' [test]
;; what we'll do For Now
;; ---------------------
;; subscriptlist: subscript
;; subscript: [test] ':' [test]
(define (p-subscript)
(let ((n1 (if (match/drop 'colon) #f (p-test))))
(let ((n2 (if (match 'rbrace) #f (p-test))))
#(slice ,n1 ,n2))))
;; ================================================================================
;; stmt: simple_stmt | compound_stmt
;; simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
;; how about we just 'inline' testlist here?
;; testlist: test (',' test)* [',']
(define (p-testlist)
(let loop ((list '()))
(case (token.class current-token)
;; these terminate a 'testlist'
((assign colon rbracket rbrace rparen augassign newline)
;; XXX detect >1 element, emit with #(tuple) wrapping...
list)
;; this continues one (maybe)
((comma)
(next)
(loop list))
(else
(loop (cons (p-test) list))))))
(define (p-delimited p separator)
;; parse a list of <p> delimited by <separator>
(let loop ((result (cons (p) '())))
(if (match/drop separator)
(loop (cons (p) result))
result)))
(define (p-yield-or-testlist)
(if (match/drop 'yield)
#(yield ,(p-testlist))
(p-testlist)))
;; expr_stmt: testlist (augassign (yield_expr|testlist) | ('=' (yield_expr|testlist))*)
;; augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | '<<=' | '>>=' | '**=' | '//=')
(define (p-expr-stmt)
(let ((n1 (p-testlist)))
(cond ((match/drop 'assign)
#(assign ,(cons n1 (p-delimited p-yield-or-testlist 'assign))))
((match/save 'augassign)
#(augassign ,saved-data ,n1 ,(p-yield-or-testlist)))
(else n1))))
;; small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt |
;; import_stmt | global_stmt | exec_stmt | assert_stmt)
(define (p-small-stmt)
(cond ((match/drop 'del)
#(del ,(p-exprlist)))
((match/drop 'pass)
#(pass))
((match/drop 'break)
#(break))
((match/drop 'continue)
#(continue))
((match/drop 'return)
#(return ,(p-testlist)))
((match/drop 'yield)
#(yield ,(p-testlist)))
((match/drop 'raise)
;; newer raise stmt only
#(raise ,(p-test)))
;; XXX import, global, exec, assert...
(else
(p-expr-stmt))
))
;; single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
(define (p-single-input)
(case (token.class current-token)
((newline) (next) #f)
((while if for try def class) (p-stmt))
(else (print-string "p-single-input goes for p-simple-stmt\n") (p-simple-stmt))))
;; simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
;; small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt |
;; import_stmt | global_stmt | exec_stmt | assert_stmt)
(define (p-simple-stmt)
(let loop ((stmts (cons (p-small-stmt) '())))
(print-string "simple loop: ") (%print current-token) (%printn stmts)
3141
(cond ((match/drop 'newline)
(print-string "returning?\n")
(vector (reverse stmts)))
;#(sequence ,(reverse stmts)))
((match/drop 'semicolon)
(loop (cons (p-small-stmt) stmts)))
(else
(syntax-error 'simple-stmt)))))
;; stmt: simple_stmt | compound_stmt
;; compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef
(define (p-stmt)
(case (token.class current-token)
((if) (next) (p-if-stmt))
((while) (next) (p-while-stmt))
((for) (next) (p-for-stmt))
((try) (next) (p-try-stmt))
((def) (next) (p-funcdef))
((class) (next) (p-classdef))
(else (p-simple-stmt))))
;; print_stmt: 'print' [testlist]
;; del_stmt: 'del' exprlist
;; pass_stmt: 'pass'
;; flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
;; break_stmt: 'break'
;; continue_stmt: 'continue'
;; return_stmt: 'return' [testlist]
;; yield_stmt: yield_expr
;; raise_stmt: 'raise' [test [',' test [',' test]]]
;; if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
;; while_stmt: 'while' test ':' suite ['else' ':' suite]
;; for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
;; try_stmt: ('try' ':' suite
;; ((except_clause ':' suite)+
;; ['else' ':' suite]
;; ['finally' ':' suite] |
;; 'finally' ':' suite))
;; # NB compile.c makes sure that the default except clause is last
;; except_clause: 'except' [test [',' test]]
;; suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
(define (p-suite)
(if (match/drop 'newline)
(if (match/drop 'indent)
(let loop ((stmts (cons (p-stmt) '())))
(if (match/drop 'dedent)
#(suite ,(reverse stmts))
(loop (cons (p-stmt) stmts))))
(syntax-error 'indent))
(p-simple-stmt)))
;; if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
(define (p-if-stmt)
(let* ((i-test (p-test))
(i-body (expect 'colon p-suite #t))
(i-cases
(let loop ((cases '()))
(cond ((match/drop 'elif)
(let* ((test1 (p-test))
(body1 (expect 'colon p-suite #t)))
(loop (cons #(elif ,test1 ,body1) cases))))
((match/drop 'else)
(let* ((body1 (expect 'colon p-suite #t)))
(reverse (cons #(else ,body1) cases))))
(else
(reverse cases))))))
#(if ,i-test ,i-body ,i-cases)))
;; while_stmt: 'while' test ':' suite ['else' ':' suite]
(define (p-while-stmt)
(let* ((w-test (p-test))
(w-body (expect 'colon p-suite #t))
(w-else (expect2 'else 'colon p-suite #f)))
#(while ,w-test ,w-body ,w-else)))
;; for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
(define (p-for-stmt)
(let* ((for-clause (p-exprlist))
(in-clause (expect 'in p-testlist #t))
(body-clause (expect 'colon p-suite #t))
(else-clause (expect2 'else 'colon p-suite #f)))
#(for ,for-clause ,in-clause ,body-clause ,else-clause)))
;; except_clause: 'except' [test [',' test]]
(define (p-except-clause)
(if (match 'colon)
#(except #f #f)
(let ((n1 (p-test)))
(if (match/drop 'comma)
#(except ,n1 ,(p-test))
#(except ,n1 #f)))))
;; try_stmt: ('try' ':' suite
;; ((except_clause ':' suite)+
;; ['else' ':' suite]
;; ['finally' ':' suite] |
;; 'finally' ':' suite))
(define (p-try-stmt)
(let* ((try-clause (expect 'colon p-suite #t))
(except-clauses
(let loop ((l '()))
(if (match/drop 'except)
(loop (cons (p-except-clause) l))
l)))
(else-clause (expect2 'else 'colon p-suite #f))
(finally-clause (expect2 'finally 'colon p-suite #f)))
#(try ,try-clause ,except-clauses ,else-clause ,finally-clause)))
;; with_stmt: 'with' test [ with_var ] ':' suite
;; with_var: ('as' | NAME) expr
;; XXX I'm not going to bother with with right now.
;; fpdef: NAME | '(' fplist ')'
;; fplist: fpdef (',' fpdef)* [',']
(define (p-fpdef)
(let* ((name (expect/data 'ident #f)))
(if name
name
(expect 'lparen p-fplist #t))))
(define (p-fplist)
(let loop ((fpdefs (cons (p-fpdef) '())))
(cond ((match/drop 'comma)
(loop (cons (p-fpdef) fpdefs)))
((match/drop 'rparen)
fpdefs)
(else
(syntax-error 'rparen)))))
;; parameters: '(' [varargslist] ')'
;; varargslist: ((fpdef ['=' test] ',')*
;; ('*' NAME [',' '**' NAME] | '**' NAME) |
;; fpdef ['=' test] (',' fpdef ['=' test])* [','])
(define (p-varargslist)
(let loop ((formals '()))
(cond ((match/drop 'rparen)
(reverse formals))
((match/drop 'power)
(loop (cons #(keyargs ,(expect/data 'ident #t)) formals)))
((match-string/drop 'mulop "*")
(loop (cons #(restargs ,(expect/data 'ident #t)) formals)))
(else
(let ((fpdef (p-fpdef)))
(%printn fpdef)
(cond ((match/drop 'comma)
(loop (cons fpdef formals)))
((match/drop 'assign)
(loop (cons #(default ,fpdef ,(p-test)) formals)))
((match/drop 'rparen)
(reverse (cons fpdef formals)))
(else
(syntax-error 'rparen))
)
)))))
;; funcdef: [decorators] 'def' NAME parameters ':' suite
;; XXX no decorators for now
(define (p-funcdef)
(let* ((name (expect/data 'ident #t))
(params (expect 'lparen p-varargslist #t))
(body (expect 'colon p-suite #t)))
#(function ,name ,params ,body)))
;; classdef: 'class' NAME ['(' [testlist] ')'] ':' suite
(define (p-classdef)
(let* ((name (expect/data 'ident #t))
(subclasses (expect 'lparen p-testlist #f))
(body (expect 'colon p-suite #t)))
#(class ,name ,subclasses ,body)))
;; ================================================================================
(next)
(let loop ()
(%printn (p-single-input))
(loop)
)
))
;; ah hah!
;; here's what's going on: let's say the last token you want is 'newline.
;; so, the last thing you do is match/drop it. However, match/drop will
;; call (next) in order to load a new token. Lexer will hang on that call
;; unless there's more data to be had. Uff da!
(define stdin-fd 0)
(define stdout-fd 1)
(let ((infile (buffered-file stdin-fd 2048)))
(write stdout-fd "] ")
;; (token-gen (make-lex-generator filename))
(parse infile))