#!/usr/bin/newlisp
;; Creative Commons Attribution (by) License v2.5
;; Full text - http://creativecommons.org/licenses/by/2.5/
;; Contact - gordon.fischer@gmail.com, desanto@mac.com
;; Copyright (c) 2006, kozoru, Inc.
;; Updated to newLISP v.10.0, L.M. May 2009
;; SentenceBoundary.lsp
;; $Rev: 2701 $
;; - Uses contexts CLEAN and BOUNDARY
;; - Supplies the following function in MAIN
;; (url-to-sentences "http://en.wikipedia.org/wiki/Grady_(band)" 15000)
;; (url-to-sentences "http://www.bbc.co.uk/drama/spooks/series4_ep10.shtml" 10000)
;; - (CLEAN:clean-html str token)
;; If token is not nil, token will be inserted at every guaranteed sentence
;; break (ie. after </li> </p> etc)
;; - (BOUNDARY:GetSentences str)
;; Breaks str on sentence boundaries. Can be used with CLEAN:clean-html to
;; leverage HTML markup to ensure sentence breaks on known HTML characters.
;; (eg. (CLEAN:clean-html str BOUNDARY:g_break_token) )
(context 'CLEAN)
;;
;; Global Configuration
;;
;; The string that we use to insert upon a known sentence boundary mark (SBM).
;; We include a space after the marker to ensure proper word separation.
(set 'g_sbm "SENTBRK ")
;; Opening tags that imply a sentence boundary; matched via PCRE replace with
;; option 513 (case-insensitive + ungreedy, per the comments in clean-html).
;; FIX: "<ol(.*)" was missing its closing ">", which left a stray ">" in the
;; cleaned text after every <ol ...> tag.
(set 'g_start_tags (list "<h\\d(.*)>" "<blockquote(.*)>" "<p(.*)>" "<ul(.*)>" "<ol(.*)>"
)) ;; will use PCRE replace
;; Closing heading/blockquote tags (exact strings, both cases);
;; plain newLISP replace, no regex.
(set 'g_heading_tags (list {</h1>} {</H1>} {</h2>} {</H2>} {</h3>} {</H3>} {</h4>} {</H4>} {</h5>} {</H5>}
{</blockquote>} {</BLOCKQUOTE>}
)) ;; will use newlisp replace
;; Other closing tags that guarantee a sentence boundary (exact strings).
(set 'g_other_tags (list {</br>} {</BR>} {<hr>} {<HR>}
{</p>} {</P>}
{</li>} {</LI>}
{</tr>} {</TR>}
{</span>} {</SPAN>}
{</div>} {</DIV>}
{</caption>} {</CAPTION>}
{</center>} {</CENTER>} {<center>} {<CENTER>}
)) ;; will use newlisp replace
;; <br ...> in any form also marks a boundary (PCRE, case-insensitive + ungreedy).
(set 'g_break_tags (list "<br(.*)>"
)) ;; will use PCRE replace
;; Table-cell closers end a word but not a sentence; clean-html only pads
;; them with a space (exact strings).
(set 'g_table_tags (list {</td>} {</TD>} {</th>} {</TH>}
)) ;; will use newlisp replace
;; Pairs of (pattern replacement) applied in order by asciify (plain replace).
(set 'g_ascii_translate '(
;; these are removed
({"} "") ("[edit]" "") (" ." "")
;; these are converted to spaces
;; NOTE(review): the two space->space entries below look like extraction
;; residue (possibly non-breaking spaces originally) - confirm against the
;; upstream script; they are harmless no-ops as written.
("_" " ") (" " " ") (" " " ")
;; these are special characters
(". ." ".") (".." ".") (":.." ":")
("’" {'}) ("\146" "'") ("“" {"}) ("”" {"}) ("—" {-}) ("\151" "-")
;; FIX: this row was entity-decoded by a web scrape into no-ops such as
;; ("&" "&") and the syntactically broken (""" {"}); restored to the HTML
;; entity forms an HTML cleaner actually needs to translate.
("&nbsp;" " ") ("&amp;" "&") ("&quot;" {"}) ("&mdash;" {-})
;; these are some UTF-8 characters
("\226\128\148" " - ") ("\226\128\147" " - ")
("\194\163" "£") ; english pound character
("\195\169" "é") ; accented e
("\226\128\162" "•") ; bullet
("\226\128\153" "'") ; reverse tick
))
;; Apply every (pattern replacement) pair in g_ascii_translate to str,
;; in list order, using plain (non-regex) replace, and return the result.
(define (asciify str)
  (dolist (pair g_ascii_translate)
    (replace (first pair) str (last pair)))
  str)
;; (clean-html data break_token) -> cleaned plain-text string
;; Strips scripts, styles, comments and tags from an HTML page, and inserts
;; break_token at every markup position that guarantees a sentence boundary
;; (headings, </p>, </li>, <br>, blank lines, ...).  Per the inline comments
;; below, regex option 513 = case-insensitive + non-greedy PCRE; 512 =
;; non-greedy; 0 = plain PCRE.  Order matters: boundary markers must be
;; inserted BEFORE the generic tag stripper runs, and asciify must run after.
;; NOTE(review): there is no space before the ',' separating parameters from
;; locals - confirm newLISP parses "break_token," as the parameter plus the
;; locals marker rather than as one symbol.
(define (clean-html data break_token, endHead pos)
;; default: delete the boundary marker instead of replacing it with a token
(unless break_token (set 'break_token ""))
; remove nils
(replace "\\000" data "" 0)
;(replace "\n" data " ")
;; strip header
(replace {<head>.*</head>} data "" 512)
;; strip text outside marked content
;; (Wikipedia-style page marker; skipped when absent)
(set 'endHead (find "<!-- start content -->" data))
(if endHead (set 'data (slice data endHead)))
;; strip footer
;; (everything from the print-footer div onward; skipped when absent)
(set 'pos (find {<div class="printfooter">} data 1))
(if pos (set 'data (slice data 0 pos)))
;; strip javascript and html comments
(replace {<script.*</script>} data "" 513)
(replace {<style.*</style>} data "" 513)
(replace {<!--.*-->} data "" 513)
;; Sentence Boundary Helpers
;; Mark every guaranteed boundary with the internal g_sbm marker.
(dolist (t g_start_tags) (replace t data g_sbm 513)) ;; case insensitive + non-greedy
(dolist (t g_heading_tags) (replace t data g_sbm))
(dolist (t g_break_tags) (replace t data g_sbm 513)) ;; case insensitive + non-greedy
(dolist (t g_other_tags) (replace t data g_sbm))
;; When we see these strings we believe there's
;; a sentence boundary too: blank line, pipe, double colon.
(replace {\n\n} data g_sbm)
(replace {|} data g_sbm)
(replace {::} data g_sbm)
;; Table cells end a word but not a sentence: just pad with a space.
(dolist (t g_table_tags) (replace t data (append " " t)))
;; Drop all remaining tags and {...} spans.
(replace {<[^>]*>} data "" 0)
(replace "{[^}]*}" data "" 0)
;; Swap the internal marker (sans its trailing space) for the caller's token.
(replace (trim g_sbm) data break_token)
;; Normalize entities / quotes / dashes via the translation table.
(set 'data (asciify data))
;; This is a citation stripper - primarily for use with wikipedia
(replace {\.\s*\[\d+\]} data "." 512)
; clean white space
(replace "\\s+" data " " 0)
)
(context 'MAIN)
(context 'BOUNDARY)
;; GetSentences keeps only sentences whose length is strictly greater than
;; MIN_SENTENCE_LENGTH and at most MAX_SENTENCE_LENGTH.
(set 'MIN_SENTENCE_LENGTH 9)
(set 'MAX_SENTENCE_LENGTH 512)
;; This list contains abbreviations that are longer than 2 characters.
;; NOTE(review): no such abbreviation list appears in this file - the comment
;; above looks stale; confirm against the upstream script.
;; Character class stripped from a sentence's final word: ( [ ] ) .
(set 'g_punct_regex "[\\(\\[\\]\\)\\.]")
;; Token marking an explicit sentence break (inserted by CLEAN:clean-html).
(set 'g_break_token "</s>")
;; (GetSentences str) -> list of sentence strings
;; Breaks str on sentence boundaries: "! ", ". ", and the explicit
;; g_break_token inserted by CLEAN:clean-html.  One-word fragments produced
;; by the ". " split (typically abbreviations) are glued back onto the
;; previous sentence.  Finally, sentences outside
;; (MIN_SENTENCE_LENGTH, MAX_SENTENCE_LENGTH] are discarded.
;; FIX: 'tmp' is now declared as a local; previously it was assigned without
;; being in the locals list and leaked into the BOUNDARY context.  The unused
;; local 'c' was dropped.
(define (GetSentences str , sentence_list word_list last_word i tmp final)
  ;; strip all double-quotes and fold newlines into spaces
  (replace {"} str "")
  (replace "\n" str " ")
  ;; We always break after an exclamation followed by a space.
  (replace "! " str (append "!" g_break_token))
  ;; break upon ". "
  (set 'sentence_list (parse str ". "))
  ;; break those pieces upon space; (filter if ...) drops empty word lists
  (set 'word_list (filter if (map (fn (x) (parse x " ")) sentence_list)))
  (set 'i 0)
  (if (> (length word_list) 1)
    (while (< i (length word_list))
      ;; Last word of the current sentence with the g_punct_regex
      ;; characters removed.
      ;; NOTE(review): last_word is computed but never read afterwards -
      ;; retained unchanged to preserve the original behavior.
      (set 'last_word (replace g_punct_regex (last (word_list i)) "" 1))
      (if
        ;; A one-word "sentence" that is not whitespace is treated as a
        ;; false split: join it onto the prior sentence and restore the
        ;; "." trimmed by the ". " parse.
        ;; NOTE(review): when i is 0 this indexes (word_list -1), i.e. the
        ;; LAST sentence - preserved as-is; confirm whether that edge case
        ;; is intended.
        (and (= (length (word_list i)) 1)
             (not (find " " (word_list i))))
        (begin
          (set 'tmp (pop word_list i))
          (setf (word_list (- i 1) -1) (append (word_list (- i 1) -1) "."))
          (setf (word_list (- i 1)) (append (word_list (- i 1)) tmp)))
        ;; Otherwise this is a valid sentence break; move to the next one.
        (inc i)))) ;; end if
  ;; Re-assemble each word list into "words." + break token, then split on
  ;; the break tokens, trim, and collapse trailing dot runs to a single ".".
  (set 'final '())
  (dolist (w word_list)
    (push (append (join w " ") "." g_break_token) final -1))
  (set 'final (map (fn (z) (replace "\\.+\\z" z "." 0))
                   (map trim (flat (map (fn (x) (parse x g_break_token)) final)))))
  ;; Keep only sentences of a sensible length.
  (filter (fn (x) (and (> (length x) MIN_SENTENCE_LENGTH)
                       (<= (length x) MAX_SENTENCE_LENGTH)))
          final))
(context 'MAIN)
;; Fetch url (timeout in milliseconds), clean its HTML with the BOUNDARY
;; break token inserted at known boundaries, and return the sentence list.
(define (url-to-sentences url timeout , page cleaned)
  (set 'page (get-url url timeout))
  (set 'cleaned (CLEAN:clean-html page BOUNDARY:g_break_token))
  (BOUNDARY:GetSentences cleaned))
;; eof
;; syntax highlighting with newLISP and syntax.cgi