#!/usr/bin/newlisp

;; Creative Commons Attribution (by) License v2.5
;; Full text - http://creativecommons.org/licenses/by/2.5/
;; Contact - gordon.fischer@gmail.com, desanto@mac.com
;; Copyright (c) 2006, kozoru, Inc.

;; Updated to newLISP v.10.0, L.M. May 2009

;; SentenceBoundary.lsp
;; $Rev: 2701 $

;;  - Uses contexts CLEAN and BOUNDARY
;;  - Supplies the following function in MAIN
;;    (url-to-sentences "http://en.wikipedia.org/wiki/Grady_(band)" 15000)
;;    (url-to-sentences "http://www.bbc.co.uk/drama/spooks/series4_ep10.shtml" 10000)
   
;;  - (CLEAN:clean-html str token)
;;    If token is not nil, token will be inserted at every guaranteed sentence
;;    break (ie. after </li> </p> etc)

;;  - (BOUNDARY:GetSentences str)
;;    Breaks str on sentence boundaries.  Can be used with CLEAN:clean-html to
;;    leverage HTML markup to ensure sentence breaks on known HTML characters.
;;    (eg. (CLEAN:clean-html str BOUNDARY:g_break_token) )

(context 'CLEAN)

;;
;; Global Configuration
;;

;; The string that we use to insert upon a known sentence boundary mark (SBM).
;; We include a space after the marker to ensure proper word separation.

(set 'g_sbm "SENTBRK ")

(set 'g_start_tags (list "<h\\d(.*)>" "<blockquote(.*)>" "<p(.*)>" "<ul(.*)>" "<ol(.*)"
                   )) ;; will use PCRE replace

(set 'g_heading_tags (list {</h1>} {</H1>} {</h2>} {</H2>} {</h3>} {</H3>} {</h4>} {</H4>} {</h5>} {</H5>} 
                         {</blockquote>} {</BLOCKQUOTE>} 
                     )) ;; will use newlisp replace

(set 'g_other_tags (list {</br>} {</BR>} {<hr>} {<HR>}
                         {</p>} {</P>} 
                         {</li>} {</LI>} 
                         {</tr>} {</TR>} 
                         {</span>} {</SPAN>} 
                         {</div>} {</DIV>} 
                         {</caption>} {</CAPTION>} 
                         {</center>} {</CENTER>} {<center>} {<CENTER>}
                   )) ;; will use newlisp replace

(set 'g_break_tags (list "<br(.*)>"
                   )) ;; will use PCRE replace

(set 'g_table_tags (list {</td>} {</TD>} {</th>} {</TH>}
                   )) ;; will use newlisp replace

(set 'g_ascii_translate '(
  ;; these are removed
  ({"} "") ("[edit]" "") (" ." "")

  ;; these are converted to spaces
  ("_" " ") ("&#160;" " ") ("&nbsp;" " ")

  ;; these are special characters
  (". ." ".") (".." ".") (":.." ":")
  ("&#146;" {'}) ("\146" "'") ("&#147;" {"}) ("&#148;" {"}) ("&#151;" {-}) ("\151" "-")
  ("&nbsp;" " ") ("&amp;" "&") ("&quot;" {"}) ("&mdash;" {-})

  ;; these are some UTF-8 characters
  ("\226\128\148" " - ") ("\226\128\147" " - ")
  ("\194\163" "&pound;") ; english pound character
  ("\195\169" "&eacute;") ; accented e
  ("\226\128\162" "&bull;") ; bullet
  ("\226\128\153" "'") ; reverse tick

))

(define (asciify str)
  (dolist (a g_ascii_translate)
    (replace (a 0) str (a 1))
  )
  str
)

(define (clean-html data break_token, endHead pos)
    (unless break_token (set 'break_token ""))
    ; remove nils
    (replace "\\000" data "" 0)

    ;(replace "\n" data " ")

    ;; strip header
    (replace {<head>.*</head>} data "" 512)

    ;; strip text outside marked content
    (set 'endHead (find "<!-- start content -->" data))
    (if endHead (set 'data (slice data endHead)))

    ;; strip footer
    (set 'pos (find {<div class="printfooter">} data 1))
    (if pos (set 'data (slice data 0 pos)))

    ;; strip javascript and html comments
    (replace {<script.*</script>} data "" 513)
    (replace {<style.*</style>} data "" 513)
    (replace {<!--.*-->} data "" 513)

    ;; Sentence Boundary Helpers

    (dolist (t g_start_tags) (replace t data g_sbm 513)) ;; case insensitive + non-greedy

    (dolist (t g_heading_tags) (replace t data g_sbm))

    (dolist (t g_break_tags) (replace t data g_sbm 513)) ;; case insensitive + non-greedy

    (dolist (t g_other_tags) (replace t data g_sbm))

    ;; When we see these strings we believe there's
    (replace {\n\n} data g_sbm)
    (replace {|} data g_sbm)
    (replace {::} data g_sbm)

    (dolist (t g_table_tags) (replace t data (append " " t)))

    (replace {<[^>]*>} data "" 0)
    (replace "{[^}]*}" data "" 0)

    (replace (trim g_sbm) data break_token)

    (set 'data (asciify data))

    ;; This is a citation stripper - primarily for use with wikipedia
    (replace {\.\s*\[\d+\]} data "." 512)

    ; clean white space
    (replace "\\s+" data " " 0)
)

(context 'MAIN)

(context 'BOUNDARY)

(set 'MIN_SENTENCE_LENGTH 9)
(set 'MAX_SENTENCE_LENGTH 512)

;; This list contains abbreviations that are longer than 2 characters.

(set 'g_punct_regex "[\\(\\[\\]\\)\\.]")
(set 'g_break_token "</s>")

(define (GetSentences str , sentence_list word_list last_word i c final)
  ;; strip all double-quotes
  (replace {"} str "")
  (replace "\n" str " ")

  ;; We always break after an exclamation followed by a space.
  (replace "! " str (append "!" g_break_token))
  
  ;; break upon ". "
  (set 'sentence_list (parse str ". "))

  ;; break those pieces upon space
  (set 'word_list (filter if (map (fn(x) (parse x " ")) sentence_list)))
  (set 'i 0)
  (if (> (length word_list) 1)
    (while (< i (length word_list))
      ;; Take the last word in a sentence and remove any of the characters in g_punct_regex
      (set 'last_word (replace g_punct_regex (last (word_list i)) "" 1))
      (if 
          ;; If the current sentence contains only one word which is not whitespace
          ;; we join it onto the prior sentence and replace the trimmed "."
          (and (= (length (word_list i)) 1)
               (not (find " " (word_list i)))
          )
          (begin
;            (println "Current : " (string (word_list i)))
            (set 'tmp (pop word_list i))
;            (println "Joining : " (string (word_list (- i 1))) " with " (string tmp))
            (setf (word_list (- i 1) -1) (append (word_list (- i 1) -1) "."))
            (setf (word_list (- i 1)) (append (word_list (- i 1)) tmp))
          )
          ;; Otherwise this is a valid sentence break and we move onto the next sentence.
          (inc i) 
      )    
    )
  ) ;; end if
  (set 'final '())
  (dolist (w word_list)
    (push (append (join w " ") "." g_break_token) final -1)
  )
  
  (set 'final (map (fn (z) (replace "\\.+\\z" z "." 0)) (map trim (flat (map (fn(x) (parse x g_break_token)) final)))))
  (filter (fn(x) (and (> (length x) MIN_SENTENCE_LENGTH) (<= (length x) MAX_SENTENCE_LENGTH))) final)
)

(context 'MAIN)

(define (url-to-sentences url timeout)
  (BOUNDARY:GetSentences (CLEAN:clean-html (get-url url timeout) BOUNDARY:g_break_token))
)

;; eof


syntax highlighting with newLISP and syntax.cgi