{- Tim's naiive-Bayes spam-identifying Haskell implementation. 
   (C) Tim Haynes, 2003    <mailto:pigbayes@spodzone.org.uk>

   Redistributable under the terms of the BSD License, see
   <http://www.opensource.org/licenses/bsd-license.php>

   Known to work well with ghc, moderately OK with nhc98 and not very well
   with hugs98.

   Requires a ~/.pigbayes/ directory with empty ham.dat,spam.dat files to
   start with.
-}

import IO
import System
import List
import Char (toUpper,ord,chr,showLitChar)
import Maybe
import Directory (renameFile)

type Count=Int
type Word=String
type Freq = (Word,Count)
type Score=Freq
type FreqList = [Freq]
type Index = [(Char,Int)]

type ScoreList = [(Char,[Score])]

toFloat :: Int -> Float
toFloat n = fromInteger (toInteger n) :: Float

tokenize :: String -> [Word]
tokenize str = let l=lines str 
		   idx=elemIndex "" l
		   i=if isNothing idx then 0 else fromJust idx
		   interesting=words (unlines (snd (splitAt i l)))
		in
		   filter ((<12).length) (filter ((>4).length) interesting)


--Parse a (word,score) pair-list
scores :: String -> ScoreList
scores foo = let w= map words (lines foo)
		 pairs= map (\t -> (t !!0, (read (t !! 1) :: Int)))  w
		 inits=[chr i | i<-[0..255], i /= 32, i/=8]
	     in
		map (\c ->  (c, ((showLitChar c ""), 0):
				 [ (w,s) | (w,s) <- pairs, c==head w ]))
		    inits

-- convert a frequency-list to a score-list
flToSl :: FreqList -> ScoreList
--flToSl fl = scores (formatPairToString fl)
flToSl fl = let w=map fst fl
		inits=map chr [0..255]
		in
		map (\c -> (c, [(w,s) | (w,s) <- fl, c==head w]))
		    inits

-- return the score part of a scorelist
scorePart :: ScoreList -> [Score]
scorePart sl = foldr1 (++) (map snd sl)

-- return the score of a particular word
scoreLookup :: Word -> ScoreList -> Count
scoreLookup w sc = let wl=head [b | (a,b) <- sc, a==head w]
		       idx=find ((==w).fst) wl
		       in
		       if isNothing idx 
			  then 0
			  else snd (fromJust idx)

freqLookup :: Word -> FreqList -> Count
--freqLookup w fl = sum [b | (a,b)<-fl, a==w]
freqLookup w fl = let f=find ((==w).fst) fl
		      in
		      if isNothing f 
		      then 0
		      else snd (fromJust f)

-- Sum the frequencies in a frequency-list
sumFreqs :: FreqList -> Count
sumFreqs fl = sum (map snd fl)

-- Generate a frequency-list from a list of words
freqThings :: [Word] -> FreqList
-- freqThings lst = map (\f -> (f, countInstances f lst)) (nub lst)
freqThings list = map (\x -> (head x, length x)) (group (sort list))

freqScoreMatch :: FreqList -> ScoreList -> [(Word,Count)]
freqScoreMatch fs sl = [(w, ci*(scoreLookup w sl)) | (w,ci)<-fs]

--topFew :: Int -> [a] -> [a]
topFew i [] = []
topFew 0 _ = []
topFew i (a:as) = a : (topFew (i-1) as)

-- How well does a word frequency-list compare against a category?
compareFreq :: FreqList -> ScoreList -> Float
compareFreq fs sl = toFloat (foldr1 (+) (map snd (freqScoreMatch fs sl)))
		    / toFloat (sumFreqs fs)
		      / toFloat (sum (map snd (scorePart sl)))

-- Return a verdict    
decision :: Float -> Float -> String
decision sr hr = if (sr>hr*1.05) then "Spam" else "Ham"

-- format pairs as strings
formatPairToString :: [Score] -> String
formatPairToString sc = unlines [ w ++ " " ++ show s | (w,s) <- sc ]

-- return a list of those scores which are significantly more in toks than ref
relativeScores :: [Score] -> ScoreList -> FreqList
relativeScores toks ref = [ (a,b) | (a,b)<-toks, 
			    b>0,
			    b>=2 *(scoreLookup a ref) ]


-- combine & dedupe two lists of scores

mergeScores :: ScoreList -> FreqList -> [Score]
mergeScores as fs = let asl=scorePart as
			w=asl++fs
			in 
                        map (\l -> (fst (head l), foldr1 (+) (map snd l))) 
				(groupBy (\a b -> fst a == fst b) 
				 (sortBy
				  (\a b -> if fst a < fst b then LT else GT)
				  w))

main = do
        home <- getEnv "HOME"
        hamScores  <- readFile (home++"/.pigbayes/ham.dat")
        spamScores <- readFile (home++"/.pigbayes/spam.dat")
        a <- getArgs
        c <- getContents

        let hS = scores hamScores
            sS = scores spamScores
            t=tokenize c
            ft=freqThings (sort t)
            spamRating=compareFreq ft sS
            hamRating= compareFreq ft hS
            ftS= ft
            dec=decision spamRating hamRating
            wl=if dec=="Spam" then freqScoreMatch ft sS
                else freqScoreMatch ft hS
         in do
               case a of
                  []     -> do putStr "Usage:\n
    -w - word-list
    -f - word-frequencies
    -c - check / analyse mail
    -h - learn as ham
    -s - learn as spam"


                  ["-c"] -> do putStr ("X-Pig-Bayes: " 
                                       ++ dec
                                       ++ " "
                                       ++ (show spamRating) 
                                       ++ ", " 
                                       ++ (show hamRating)
				       ++ if (and [spamRating/=0, 
						   hamRating/=0])
					  then ", Ratio: " ++ 
					       show (hamRating/spamRating)
					  else "")

                  ["-f"] -> do 
                                putStr ("X-Pig-Keywords: "
                                        ++ unwords (topFew 50
                                                    (map fst
                                                     (reverse
                                                      (sortBy 
                                                       (\a b -> 
                                                        if (snd a) < (snd b) 
                                                        then LT else GT)
                                                       ft)))))

                  ["-w"] -> do 
                                putStr ("X-Pig-Keywords: "
                                        ++ unwords (topFew 50
                                                    (map fst
                                                     (reverse
                                                      (sortBy 
                                                       (\a b -> 
                                                        if (snd a) < (snd b) 
                                                        then LT else GT)
                                                       wl)))))

                  ["-s"] -> do
                                putStr "learning things as spam"
                                writeFile (home++"/.pigbayes/spam-writing.dat")
                                        (formatPairToString
                                               (relativeScores
                                                    (mergeScores sS ftS)
                                                    hS))
                                renameFile (home++"/.pigbayes/spam-writing.dat")
                                           (home++"/.pigbayes/spam.dat")
                  ["-h"] -> do
                                putStr "learning things as ham"
                                writeFile (home++"/.pigbayes/ham-writing.dat")
                                        (formatPairToString
                                               (relativeScores
                                                    (mergeScores hS ftS)
                                                    sS))
                                renameFile (home++"/.pigbayes/ham-writing.dat")
                                           (home++"/.pigbayes/ham.dat")
               putStr "\n"