;;; toy-scanner.scm ;;; A demonstration of a scanner for a [very small] subset of Scheme ;;; ;;; Programmer: Mayer Goldberg, 1999 ;;; The scanner (define char-input-stream->scheme-token-input-stream ;; ics is an input-char-stream of some sort (maybe from a string, a ;; file, an sexpr, etc (lambda (ics) (ics ;; This is how we get the streams get and unget methods. Each ;; stream comes with ITS OWN get and unget methods, and you can ;; name them anyway you like. Since the return value of the call ;; to char-input-stream->scheme-token-input-stream is an ;; input-stream (of tokens), we will need to define our own ;; get-token and unget-token with which to construct the stream, ;; so we use ics-get and ics-unget to tag the character stream ;; methods. (lambda (ics-get ics-unget) ;; The variables here are "global" as far as the state machine ;; is concerned. Here is where we keep a digit buffer, ;; symbol-character buffer, etc. ;; ;; The return hooks -- return-token-hook and ;; return-if-not-available-hook are used for storing the ;; return-token and return-end continuations just before ;; entering the state machine (since we can't pass on ;; parameters to the state machine!) (let ((return-token-hook 'uninitialised) (return-if-not-available-hook 'uninitialised) (sign-positive? 'uninitialised) (digit-buffer 'uninitialised)) ;; ;; The automaton states start here, and are prefixed ;; with "st-" ;; (letrec ((st-initial ;; This is the initial state -- "q0" in the DFA ;; lingo ... We enter the state machine ONLY from ;; here. We may need to do some basic ;; initialization at this state ... (lambda () ;; We try to get a char from the char input ;; stream ... (ics-get ;; ... a char is available! Yippie! (lambda (ch) (cond ((is-digit? ch) ;; If we see a digit we do some basic ;; initialization too ... Told you! (set! digit-buffer '()) (set! sign-positive? #t) (ics-unget ch) (st-digit)) ((is-minus? ch) (st-minus)) ((is-plus? ch) (st-plus)) ;; We skip whitespaces -- a whitespace ;; isn't a part of a token -- we go back ;; to the initial state ... ((is-whitespace? ch) (st-initial)) ;; What the **** is this??? I know! It's ;; an error! (else (return-token-hook (make-token-unknown ch))))) ;; ... no char is available! Damn! return-if-not-available-hook))) (st-plus (lambda () (ics-get (lambda (ch) (cond ((is-digit? ch) (ics-unget ch) (set! sign-positive? #t) (set! digit-buffer '()) (st-digit)) (else (return-token-hook (make-token-unknown `(#\+ ,ch)))))) (lambda () (return-token-hook (make-token-unknown #\+)))))) (st-minus (lambda () (ics-get (lambda (ch) (cond ((is-digit? ch) (ics-unget ch) (set! sign-positive? #f) (set! digit-buffer '()) (st-digit)) (else (return-token-hook (make-token-unknown `(#\- ,ch)))))) (lambda () (return-token-hook (make-token-unknown `(#\- ,ch))))))) (st-digit (lambda () (ics-get (lambda (ch) (cond ((is-digit? ch) (set! digit-buffer (cons ch digit-buffer)) (st-digit)) ((is-whitespace? ch) (return-token-hook (make-token-number (digit-buffer->integer sign-positive? digit-buffer)))) (else (return-token-hook (make-token-unknown (reverse (cons ch digit-buffer))))))) (lambda () (return-token-hook (make-token-number (digit-buffer->integer sign-positive? digit-buffer)))))))) ;; The state machine is now defined. We now begin to define ;; what we need for the scheme token input stream (stis) ;; mechanism: We need an unget buffer for putting back ;; tokens ... (let ((unget-buffer '())) (letrec ((stis-get ;; This is the method for getting tokens (lambda (return-token return-if-not-available) (if (null? unget-buffer) ;; Nothing in the unget buffer, so we ;; need to run the DFA to generate a ;; token for us (begin ;; We make the continuations accessible to ;; the state machine, so that states can ;; return tokens or return if nothing is ;; available (set! return-token-hook return-token) (set! return-if-not-available-hook return-if-not-available) ;; And we now call the initial ;; state. This starts the DFA ... ;; The return continuations will be ;; invoked from within the DFA!!! (st-initial)) (let ((tok (car unget-buffer))) ;; We get the first token in the unget ;; buffer ... (set! unget-buffer (cdr unget-buffer)) (return-token tok))))) (stis-unget ;; This is the method for ungetting tokens ... (lambda (tok) (set! unget-buffer (cons tok unget-buffer))))) ;; All you need to create a new kind of input stream is ;; a getter and an ungetter ... (^input-stream stis-get stis-unget))))))))) (define make-token-number (lambda (n) `(number ,n))) (define make-token-unknown (lambda (x) `(unknown ,x))) (define is-digit? (lambda (x) (and (char>=? x #\0) (char<=? x #\9)))) (define is-minus? (lambda (x) (char=? x #\-))) (define is-plus? (lambda (x) (char=? x #\+))) (define is-whitespace? (lambda (x) (or (char=? x #\space) (char=? x #\tab) (char=? x #\return) (char=? x #\newline) (char=? x #\page)))) (define digit-buffer->integer (lambda (is-positive? s) (letrec ((digit-buffer->integer (lambda (s multiplier) (if (null? s) 0 (+ (* (digit-char->integer (car s)) multiplier) (digit-buffer->integer (cdr s) (* 10 multiplier))))))) (let ((n (digit-buffer->integer s 1))) (if is-positive? n (- n)))))) (define digit-char->integer (let ((zero-in-ascii (char->integer #\0))) (lambda (c) (- (char->integer c) zero-in-ascii)))) ;;; Testing our scanner (define ^test (lambda (ics) (let ((its (char-input-stream->scheme-token-input-stream ics))) (its (lambda (get unget) (letrec ((token-list (lambda () (get (lambda (token) (cons token (token-list))) (lambda () '()))))) (token-list))))))) (define test-string (lambda (string) (let ((ics (string->char-input-stream string))) (^test ics)))) (define test-sexpr (lambda (sexpr) (let ((ics (sexpr->char-input-stream sexpr))) (^test ics)))) (define test-file (lambda (file-name) (let ((ics (file->char-input-stream file-name))) (^test ics))))