#!/usr/bin/env newlisp
;; @module Nlex
;; @author cormullion
;; @description newLISP source code lexer/tokenizer/parser
;; @location https://github.com/cormullion/newlisp-projects
;; @version 0.1 of 2011-09-19 08:55:19
;;<h4>About this module</h4>
;;<p>The Nlex module is a lexer/tokenizer/parser for newLISP source code.
;; An expert from StackOverflow xplains:
;; A tokenizer breaks a stream of text into tokens.
;; A lexer is basically a tokenizer, but it usually attaches extra context to the tokens.
;; A parser takes the stream of tokens from the lexer and turns it into an abstract syntax tree representing the program represented by the original text.</p>
;;<p><b>Usage</b></p>
;;<p>To tokenize/parse source code stored in symbol 'original, use <b>parse-newlisp</b>, To convert the parsed source tree back to plain source, use <b>nlx-to-plaintext</b>:</p>
;;<pre>
;;(letn ((converted (Nlex:parse-newlisp original-source)) ; parses
;; (new-original (Nlex:nlx-to-plaintext converted))) ; converts back to plain text
;;</pre>
;;<p>After this round trip, original-source and new-original should be identical.</p>
;;<p></p>
(context 'Nlex)
; class variables
(define *cursor*)
(define *source-length*)
(define *source-list*)
(define *depth*)
(define *tree*)
(define *loc*)
(define (get-next-char)
(let ((nch ""))
(if (< *cursor* *source-length*)
(begin
(set 'nch (*source-list* *cursor*))
(inc *cursor* (utf8len nch)))
(set 'nch nil))
nch))
(define (peek-char)
(let ((pch ""))
(if (< *cursor* *source-length*)
(set 'pch (*source-list* *cursor*))
(set 'pch nil))))
(define (char-identifier-first? c)
(not (find (lower-case c) [text] #;"'(){}.0123456789[/text])))
(define (char-identifier? c)
(not (find (lower-case c) { "':,()})))
(define (char-numeric-first? c)
(find c {123456789+-.0}))
(define (char-numeric? c)
(find c {0123456789+-.xXabcdefABracketedCommandDEF}))
(define (char-whitespace? c)
(or (= c " ") (= c "\n") (= c "\t")))
(define (open-paren-token)
(add-to-parse-tree '(LeftParen "(")))
(define (close-paren-token)
(add-to-parse-tree '(RightParen ")")))
(define (read-comment c)
(let ((res c) (ch ""))
(while (and (!= (set 'ch (get-next-char)) "\n") ch)
(push ch res -1))
(add-to-parse-tree (list 'Comment (string res "\n")))))
(define (read-identifier c)
(let ((res c) (ch ""))
; look for end of identifier
(while (and (not (find (set 'ch (peek-char)) " \"',()\n\t\r")) (!= ch nil))
(push (get-next-char) res -1))
(add-to-parse-tree (list 'Symbol res))))
(define (read-number-scanner list-so-far)
(let ((next-char (peek-char)))
;; if next-char is a digit then recurse
(if (and (char-numeric? next-char) next-char)
(read-number-scanner (cons (get-next-char) list-so-far))
(reverse list-so-far))))
(define (precise-float str)
; more faithful to original format than newLISP's float?
(let ((p "") (q ""))
(map set '(p q) (parse str "."))
(append p "." q)))
(define (scientific-float str)
(let ((p "") (q ""))
(map set '(p q) (parse str "e"))
(append p "e" q)))
(define (read-number c)
(let ((res '() number-as-string ""))
(set 'number-as-string (join (read-number-scanner (list c))))
(cond
; try hex first
((starts-with (lower-case number-as-string) "0x")
(set 'res (list 'Hex number-as-string)))
; scientific notation if there's an e
((find "e" (lower-case number-as-string))
(set 'res (list 'Scientific (scientific-float number-as-string))))
; float?
((find "." number-as-string)
; newLISP's float function isn't quite what we want here
(set 'res (list 'Float (precise-float number-as-string))))
; octal, not hex or float? 017 is OK, 019 is read as 10
((and (starts-with (lower-case number-as-string) "0")
(> (length number-as-string) 1)
(empty? (difference (explode number-as-string) (explode "01234567"))))
(set 'res (list 'Octal number-as-string)))
; perhaps an integer? 019 is read as 19 ...
((integer? (int number-as-string 0 10))
(set 'res (list 'Integer (int number-as-string 0 10))))
; give up
(true
(set 'res (list 'NaN "NaN"))))
(add-to-parse-tree res)))
(define (read-quote)
(add-to-parse-tree '(Quote "'")))
(define (read-quoted-string)
(let ((res {}) (ch {}))
(while (and (!= (set 'ch (get-next-char)) {"}) ch)
(push ch res -1)
; check for backslashed quotes
(when (= ch {\})
(set 'ch (get-next-char))
(push ch res -1)))
(add-to-parse-tree (list 'QuotedString res))))
(define (read-braced-string)
(let ((res "") (ch {}) (level 1))
; we've already seen the first { so we're up to level 1
(while (> level 0)
(set 'ch (get-next-char))
(if (= ch "{") (inc level))
(if (= ch "}") (dec level))
(if (or (< level 0) (= ch nil)) (throw-error (string "error in a braced string at character " *cursor*)))
; don't push final "}"
(if (and (> level 0)) (push ch res -1)))
(add-to-parse-tree (list 'BracedString res))))
(define (read-bracketed-string ch)
(let ((res "") (ch {}))
(cond
; bracketed TEXT?
((= (lower-case (join (slice *source-list* (- *cursor* 1) 6))) "[text]")
; look for final [/text]
(inc *cursor* 5)
; look for end
(while (and (< *cursor* (- *source-length* 7))
(!= (lower-case (join (*cursor* 7 *source-list*))) "[/text]")
ch)
(push (get-next-char) res -1))
(inc *cursor* 7)
(add-to-parse-tree (list 'BracketedText res)))
; bracketed CMD?
((= (lower-case (join (slice *source-list* (- *cursor* 1) 5))) "[cmd]")
; look for final [/cmd]
(inc *cursor* 4)
(while (and (< *cursor* (- *source-length* 6))
(!= (lower-case (join (*cursor* 6 *source-list*))) "[/cmd]")
ch)
(push (get-next-char) res -1))
(inc *cursor* 6)
(add-to-parse-tree (list 'BracketedCommand res)))
; must be those weird bracketed identifiers
(true
(while (and (!= (set 'ch (get-next-char)) {]}) ch)
(push ch res -1)
; check for backslashed quotes
(when (= ch {\})
(set 'ch (get-next-char))
(push ch res -1)))
(add-to-parse-tree (list 'BracketedIdentifier res))))))
(define (read-whitespace ch)
(let ((res ch))
(while (find (set 'ch (peek-char)) " \n\t")
(push (get-next-char) res -1))
(add-to-parse-tree (list 'WhiteSpace (base64-enc res)))))
(define (get-token)
(let ((first-char (get-next-char)))
(if first-char
(cond
; a - or + could be the start of a symbol or a number, so look at the next char
((or (= first-char "-") (= first-char "+"))
(if (find (peek-char) "1234567890")
(read-number first-char)
(read-identifier first-char)))
((char-whitespace? first-char)
(read-whitespace first-char))
((= first-char {(})
(open-paren-token))
((= first-char {)})
(close-paren-token))
((= first-char {#})
(read-comment first-char))
((= first-char {;})
(read-comment first-char))
((= first-char {"})
(read-quoted-string))
((= first-char "{")
(read-braced-string))
((= first-char "[")
(read-bracketed-string first-char))
((= first-char {'})
(read-quote))
((char-numeric-first? first-char)
(read-number first-char))
((char-identifier-first? first-char)
(read-identifier first-char))
(true (throw-error (string "{" first-char "} is an unrecognized token")))))))
(define (add-to-parse-tree token-pair)
(let (token (first token-pair))
(cond
((= token 'LeftParen)
(inc *depth*)
(push '((LeftParen "(")) *tree* *loc*)
(push -1 *loc*))
((= token 'RightParen)
(push '(RightParen ")") *tree* *loc*)
(dec *depth*)
(pop *loc*))
(true
(push token-pair *tree* *loc*)
true))))
(define (parse-newlisp src)
; main function: tokenize/lex/parse the string in src
(set '*depth* 0
'*tree* '()
'*loc* '(-1)
'*cursor* 0
'*source-list* (explode src)
'*source-length* (utf8len src)
'*source-length* (length *source-list*))
(while (< *cursor* *source-length*)
(get-token))
*tree*)
(define (nlx-to-plaintext nlx (depth 0))
(if (= depth 0) (set 'buff {})) ; if first pass, initialize a buffer
(dolist (element nlx)
(set 'token-type (first element) 'token-value (last element))
(if (atom? token-type)
(cond
((= token-type 'LeftParen) ; left parenthesis
(extend buff {(}))
((= token-type 'RightParen) ; right parenthesis
(extend buff {)}))
((= token-type 'WhiteSpace) ; whitespace
(dostring (s (base64-dec token-value))
(extend buff (string (char s)))))
((= token-type 'BracedString) ; braced string
(extend buff (string "{" token-value "}")))
((= token-type 'QuotedString) ; quoted string
(extend buff (string {"} token-value {"})))
((= token-type 'BracketedText) ; bracketed text
(extend buff (string {[text]} token-value {[/text]})))
((= token-type 'Quote); quote
(extend buff (string "'")))
((= token-type 'Comment) ; comment
(extend buff (string (last element) "\n")))
((= token-type 'Integer) ; int
(extend buff (string (int (last element)))))
((= token-type 'Float) ; float
(extend buff (string (precise-float (last element)))))
((= token-type 'Scientific) ; scientific notation
(extend buff (scientific-float (last element))))
((= token-type 'BracketedCommand) ; bracketed command
(extend buff (string {[cmd]} (last element) {[/cmd]})))
((or
(= token-type 'Symbol) ; close parenthesis
(= token-type 'Hex) ; hex
(= token-type 'NaN) ; not a number
(= token-type 'Octal) ; octal
)
(extend buff (string (last element))))
((= token-type 'BracketedIdentifier) ; bracketed identifier
(extend buff (string {[} (last element) {]}))))
; not an atom, so recurse but don't initialize buffer
(nlx-to-plaintext element 1)))
buff)
;eof
syntax highlighting with newLISP and newLISPdoc