{-
    Kaya - My favourite toy language.
    Copyright (C) 2004, 2005 Edwin Brady

    This file is distributed under the terms of the GNU General
    Public Licence. See COPYING for licence.
-}

module Lexer where

import Char
import Language
import Debug.Trace

data ParseOpts = ParseOpts {
		    libdirs :: [FilePath]
			   }

type LineNumber = Int
type Vars = Int -- Number of tmp variables we've introduced

type P a = String -> String -> [FilePath] -> Name -> LineNumber -> Vars -> 
           Result a

getLibdirs :: P [FilePath]
getLibdirs = \s fn ld mod l v -> Success ld

getLineNo :: P LineNumber
getLineNo = \s fn ld mod l v -> Success l

getFileName :: P String
getFileName = \s fn ld mod l v -> Success fn

getContent :: P String
getContent = \s fn ld mod l v -> Success s

getModuleName :: P Name
getModuleName = \s fn ld mod l v -> Success mod

getNumVars :: P Vars
getNumVars = \s fn ld mod l v -> Success v

setLineNo :: LineNumber -> P a -> P a
setLineNo i p = \s fn ld mod l v -> p s fn ld mod i v

--incVars :: P a
--incVars = \s fn ld mod l v -> p s fn ld mod l (v+1)

thenP :: P a -> (a -> P b) -> P b
m `thenP` k = \s fn ld mod l v ->
   case m s fn ld mod l v of 
      -- easiest just to keep incrementing the variable number here...
       Success a -> k a s fn ld mod l (v+1)
       Failure e f ln -> Failure e f ln

returnP :: a -> P a
returnP a = \s fn ld mod l v -> Success a

failP :: String -> P a
failP err = \s fn ld mod l v -> Failure err fn l

catchP :: P a -> (String -> P a) -> P a
catchP m k = \s fn ld mod l v ->
   case m s fn ld mod l v of
      Success a -> Success a
      Failure e f ln -> k e s fn ld mod l v

data Token = TokenInt Int
	   | TokenReal Double
           | TokenName Name
           | TokenString String
	   | TokenBool Bool
	   | TokenChar Char
	   | TokenMetaVar Int
           | TokenUnderscore
           | TokenEq
           | TokenPlus
           | TokenMinus
           | TokenTimes
           | TokenDiv
           | TokenPower
           | TokenMod
	   | TokenAnd
	   | TokenOr
	   | TokenAndBool
	   | TokenOrBool
	   | TokenXOR
	   | TokenShLeft
	   | TokenShRight
	   | TokenInc
	   | TokenDec
	   | TokenIncBy
	   | TokenDecBy
	   | TokenMultBy
	   | TokenDivBy
           | TokenIntEq
           | TokenIntNE
           | TokenOB
           | TokenCB
           | TokenOCB
           | TokenCCB
           | TokenOSB
           | TokenCSB
           | TokenLT
           | TokenGT
           | TokenLE
           | TokenGE
	   | TokenNot
           | TokenAssign
	   | TokenColon
	   | TokenAt
	   | TokenAtBracket
	   | TokenHashBracket
	   | TokenBacktick
	   | TokenSemiColon
	   | TokenComma
	   | TokenHash
	   | TokenDot
	   | TokenDots
	   | TokenColons
--	   | TokenSize
	   | TokenData
	   | TokenAbstract
	   | TokenType
	   | TokenVar
	   | TokenIntType
	   | TokenCharType
	   | TokenBoolType
	   | TokenRealType
	   | TokenStringType
	   | TokenPointerType
	   | TokenVoidType
	   | TokenReturn
	   | TokenWhile
	   | TokenDo
	   | TokenRepeat
	   | TokenFor
	   | TokenBreak
	   | TokenPass
	   | TokenTry
	   | TokenCatch
	   | TokenFinally
	   | TokenThrow
	   | TokenException
	   | TokenNewException
	   | TokenExtException
	   | TokenNewTry
           | TokenMatch
           | TokenWith
	   | TokenTo
	   | TokenIn
	   | TokenCase
	   | TokenOf
           | TokenLet
	   | TokenIf
	   | TokenElse
	   | TokenEnd
	   | TokenPrint
--	   | TokenInputStr
--	   | TokenInputNum
	   | TokenArrow
	   | TokenInclude
	   | TokenImport
	   | TokenModule
	   | TokenProgram
	   | TokenWebapp
	   | TokenWebprog
	   | TokenShebang
	   | TokenExtern
	   | TokenExternInline
	   | TokenCycleExtern
           | TokenFunMap
	   | TokenLifted
	   | TokenExtData
	   | TokenImported
	   | TokenDocstring
           | TokenDeprecated
	   | TokenLink
	   | TokenLine
	   | TokenFile
           | TokenTest
	   | TokenVM
	   | TokenLength
           | TokenStartup
	   | TokenGlobal
	   | TokenPublic
	   | TokenPrivate
	   | TokenPure
	   | TokenDefault
	   | TokenLambda
	   | TokenDatacon
	   | TokenDatatype
	   | TokenExtType
	   | TokenCInclude
	   | TokenForeign
	   | TokenApp
	   | TokenFNid
	   | TokenEOF
   deriving (Show, Eq)

lexer :: (Token -> P a) -> P a
lexer cont [] = cont TokenEOF []
lexer cont ('\n':cs) = \fn ld mod line v -> lexer cont cs fn ld mod (line+1) v
lexer cont ('r':'"':cs) = lexString True cont cs
lexer cont ('R':'"':cs) = lexString True cont cs
lexer cont ('r':'\'':cs) = lexChar True cont cs
lexer cont ('R':'\'':cs) = lexChar True cont cs
lexer cont (c:cs) 
      | isSpace c = \fn ld mod line v -> lexer cont cs fn ld mod line v
      | isAlpha c = lexVar cont (c:cs)
      | isDigit c = lexNum cont (c:cs)
      | c == '_' = lexVar cont (c:cs)
lexer cont ('$':c:cs) | isDigit c = lexMetaVar cont (c:cs)
lexer cont ('/':'*':cs) = lexerEatComment 0 cont cs
lexer cont ('/':'/':cs) = lexerEatToNewline cont cs
lexer cont ('%':cs) = lexSpecial cont cs
lexer cont ('"':cs) = lexString False cont cs
lexer cont ('\'':cs) = lexChar False cont cs
lexer cont ('+':'+':cs) = cont TokenInc cs
lexer cont ('-':'-':cs) = cont TokenDec cs
lexer cont ('+':'=':cs) = cont TokenIncBy cs
lexer cont ('-':'=':cs) = cont TokenDecBy cs
lexer cont ('*':'=':cs) = cont TokenMultBy cs
lexer cont ('/':'=':cs) = cont TokenDivBy cs
lexer cont ('+':cs) = cont TokenPlus cs
lexer cont ('-':'>':cs) = cont TokenArrow cs
lexer cont ('-':cs) = cont TokenMinus cs
--lexer cont (':':'=':cs) = cont TokenAssign cs
lexer cont ('=':'=':cs) = cont TokenIntEq cs
lexer cont ('#':'(':cs) = cont TokenHashBracket cs
lexer cont ('!':'=':cs) = cont TokenIntNE cs
lexer cont ('!':cs) = cont TokenNot cs
lexer cont ('=':cs) = cont TokenEq cs
lexer cont ('*':'*':cs) = cont TokenPower cs
lexer cont ('*':cs) = cont TokenTimes cs
lexer cont ('@':'(':cs) = cont TokenAtBracket cs
lexer cont ('@':cs) = cont TokenAt cs
lexer cont ('/':cs) = cont TokenDiv cs
lexer cont ('&':'&':cs) = cont TokenAndBool cs
lexer cont ('|':'|':cs) = cont TokenOrBool cs
lexer cont ('&':cs) = cont TokenAnd cs
lexer cont ('|':cs) = cont TokenOr cs
lexer cont ('^':cs) = cont TokenXOR cs
lexer cont ('(':cs) = cont TokenOB cs
lexer cont (')':cs) = cont TokenCB cs
lexer cont ('{':cs) = cont TokenOCB cs
lexer cont ('}':cs) = cont TokenCCB cs
lexer cont ('[':cs) = cont TokenOSB cs
lexer cont (']':cs) = cont TokenCSB cs
lexer cont ('<':'<':cs) = cont TokenShLeft cs
lexer cont ('>':'>':cs) = cont TokenShRight cs
lexer cont ('<':'=':cs) = cont TokenLE cs
lexer cont ('>':'=':cs) = cont TokenGE cs
lexer cont ('<':cs) = cont TokenLT cs
lexer cont ('>':cs) = cont TokenGT cs
lexer cont (':':':':cs) = cont TokenColons cs
--lexer cont (':':cs) = cont TokenColon cs
lexer cont (';':cs) = cont TokenSemiColon cs
lexer cont (',':cs) = cont TokenComma cs
--lexer cont ('#':cs) = cont TokenHash cs
lexer cont ('.':'.':cs) = cont TokenDots cs
lexer cont ('.':cs) = cont TokenDot cs
lexer cont ('`':cs) = cont TokenBacktick cs
lexer cont ('\\':cs) = cont TokenLambda cs
lexer cont ('#':'!':cs) = cont TokenShebang (stripToNL cs)
   where stripToNL ('\n':cs) = cs
	 stripToNL (x:cs) = stripToNL cs
lexer cont (c:cs) = lexError c cs

lexError c s l = failP (show l ++ ": Unrecognised token '" ++ [c] ++ "'\n") s l

lexerEatComment nls cont ('*':'/':cs) 
    = \fn ld mod line v -> lexer cont cs fn ld mod (line+nls) v
lexerEatComment nls cont ('\n':cs) = lexerEatComment (nls+1) cont cs
lexerEatComment nls cont (c:cs) = lexerEatComment nls cont cs
lexerEatToNewline cont ('\n':cs) 
   = \fn ld mod line v -> lexer cont cs fn ld mod (line+1) v
lexerEatToNewline cont (c:cs) = lexerEatToNewline cont cs

lexNum cont cs = cont tok rest
  where (num,rest,isreal) = readNum cs
	tok | isreal = TokenReal (read num)
	    | otherwise = TokenInt (read num)

lexMetaVar cont cs = cont (TokenMetaVar (read num)) rest
      where (num,rest) = span isDigit cs

readNum :: String -> (String,String,Bool)
readNum x = rn' False "" x
  where rn' dot acc [] = (acc,[],dot)
	rn' False acc ('.':xs) | head xs /= '.' = rn' True (acc++".") xs
	rn' dot acc (x:xs) | isDigit x = rn' dot (acc++[x]) xs
	rn' dot acc ('e':'+':xs) = rn' True (acc++"e+") xs
	rn' dot acc ('e':'-':xs) = rn' True (acc++"e-") xs
	rn' dot acc ('e':xs) = rn' True (acc++"e") xs
	rn' dot acc xs = (acc,xs,dot)

lexString raw cont cs = 
   \fn ld mod line v ->
   case getstr raw cs of
      Just (str,rest,nls) -> cont (TokenString str) rest fn ld mod (nls+line) v
      Nothing -> failP (fn++":"++show line++":Unterminated string constant")
                    cs fn ld mod line v

lexChar raw cont cs = 
   \fn ld mod line v ->
   case getchar raw cs of
      Just (str,rest) -> cont (TokenChar str) rest fn ld mod line v
      Nothing -> failP (fn++":"++show line++":Unterminated character constant")
		       cs fn ld mod line v

isAllowed c = isAlpha c || isDigit c || c `elem` "_\'?"

lexVar cont cs =
   case span isAllowed cs of
-- Keywords
      ("data",rest) -> cont TokenData rest
      ("abstract",rest) -> cont TokenAbstract rest
      ("type",rest) -> cont TokenType rest
      ("var",rest) -> cont TokenVar rest
      ("Int",rest) -> cont TokenIntType rest
      ("Char",rest)  -> cont TokenCharType rest
      ("Bool",rest)  -> cont TokenBoolType rest
      ("Float",rest)  -> cont TokenRealType rest
      ("String",rest) -> cont TokenStringType rest
      ("Ptr",rest) -> cont TokenPointerType rest
      ("Void",rest)  -> cont TokenVoidType rest
      ("return",rest)  -> cont TokenReturn rest
      ("foreign",rest) -> cont TokenForeign rest
      ("while",rest)  -> cont TokenWhile rest
      ("do",rest)  -> cont TokenDo rest
      ("repeat",rest)  -> cont TokenRepeat rest
      ("for",rest) -> cont TokenFor rest
      ("break",rest) -> cont TokenBreak rest
      ("pass",rest) -> cont TokenPass rest
      ("try",rest) -> cont TokenNewTry rest
      ("throw",rest) -> cont TokenThrow rest
      ("catch",rest) -> cont TokenCatch rest
      ("finally",rest) -> cont TokenFinally rest
      ("Exception",rest) -> cont TokenNewException rest
--      ("to",rest) -> cont TokenTo rest
      ("in",rest) -> cont TokenIn rest
      ("case",rest) -> cont TokenCase rest
      ("of",rest) -> cont TokenOf rest
      ("let",rest) -> cont TokenLet rest
--      ("end",rest) -> cont TokenEnd rest
      ("trace",rest)  -> cont TokenPrint rest
--      ("readInt",rest)  -> cont TokenInputNum rest
--      ("readStr",rest)  -> cont TokenInputStr rest
      ("if",rest) -> cont TokenIf rest
      ("else",rest) -> cont TokenElse rest
      ("true",rest) -> cont (TokenBool True) rest
      ("default",rest) -> cont (TokenDefault) rest
      ("false",rest) -> cont (TokenBool False) rest
--      ("size",rest) -> cont TokenSize rest
--      ("CINCLUDE",rest) -> cont TokenCInclude rest
      ("include",rest) -> cont TokenInclude rest
      ("import",rest) -> cont TokenImport rest
--      ("program",rest) -> cont TokenProgram rest
      ("module",rest) -> cont TokenModule rest
--      ("webapp",rest) -> cont TokenWebapp rest
--      ("webprog",rest) -> cont TokenWebprog rest
--      ("fnid",rest) -> cont TokenFNid rest
      ("globals",rest) -> cont TokenGlobal rest
      ("lambda",rest) -> cont TokenLambda rest
      ("public",rest) -> cont TokenPublic rest
      ("private",rest) -> cont TokenPrivate rest
      ("pure",rest) -> cont TokenPure rest
      ("extern",rest) -> cont TokenCycleExtern rest
      ("operator",rest) -> overload cont rest
      (var,rest)   -> cont (mkname var) rest

overload cont ('*':'*':rest) = cont (TokenName (OP Power)) rest

overload cont ('=':'=':rest) = cont (TokenName (OP Equal)) rest
overload cont ('!':'=':rest) = cont (TokenName (OP NEqual)) rest
overload cont ('<':'=':rest) = cont (TokenName (OP OpLE)) rest
overload cont ('>':'=':rest) = cont (TokenName (OP OpGE)) rest
overload cont ('<':rest) = cont (TokenName (OP OpLT)) rest
overload cont ('>':rest) = cont (TokenName (OP OpGT)) rest

overload cont ('&':'&':rest) = cont (TokenName (OP OpAndBool)) rest
overload cont ('|':'|':rest) = cont (TokenName (OP OpOrBool)) rest

overload cont ('&':rest) = cont (TokenName (OP OpAnd)) rest
overload cont ('|':rest) = cont (TokenName (OP OpOr)) rest

overload cont ('+':rest) = cont (TokenName (OP Plus)) rest
overload cont ('-':rest) = cont (TokenName (OP Minus)) rest
overload cont ('*':rest) = cont (TokenName (OP Times)) rest
overload cont ('/':rest) = cont (TokenName (OP Divide)) rest
overload cont ('%':rest) = cont (TokenName (OP Modulo)) rest
overload cont (x:xs) = lexError x xs

lexSpecial cont cs =
    case span isAllowed cs of
      ("extern",rest) -> cont TokenExtern rest
      ("extinline",rest) -> cont TokenExternInline rest
      ("fnmap",rest) -> cont TokenFunMap rest
      ("lifted",rest) -> cont TokenLifted rest
      ("data",rest) -> cont TokenExtData rest
      ("datacon",rest) -> cont TokenDatacon rest
      ("datatype",rest) -> cont TokenDatatype rest
      ("type",rest) -> cont TokenExtType rest
      ("doc",rest) -> cont TokenDocstring rest
      ("imported",rest) -> cont TokenImported rest
      ("deprecated",rest) -> cont TokenDeprecated rest
      ("include",rest) -> cont TokenCInclude rest
      ("link",rest) -> cont TokenLink rest
      ("line",rest) -> setLine cont rest
      ("file",rest) -> setFile cont rest
      ("test",rest) -> cont TokenTest rest
      ("VM",rest) -> cont TokenVM rest
      ("length",rest) -> cont TokenLength rest
      ("startup",rest) -> cont TokenStartup rest
      ("Exception",rest) -> cont TokenException rest
      ("except",rest) -> cont TokenExtException rest
      ("try",rest) -> cont TokenTry rest
      ("case",rest) -> cont TokenMatch rest
      ("of",rest) -> cont TokenWith rest
      (thing,rest) -> cont TokenMod (thing++rest)

setLine cont (x:xs) 
    | isSpace x = setLine cont xs
    | isDigit x = case span isDigit (x:xs) of
                    (num, rest) -> 
                        \fn ld mod line v -> lexer cont rest fn ld mod (read num) v
    | otherwise = \fn ld mod line v ->
                     failP (fn++":"++show line++":Invalid %line directive")
		       xs fn ld mod line v

setFile cont (x:xs) 
    | isSpace x = setFile cont xs
    | x == '"'  = case getstr False xs of
                    (Just (str, rest, nls)) -> 
                      \fn ld mod line v -> lexer cont rest str ld mod (line+nls) v
                    Nothing -> \fn ld mod line v ->
                     failP (fn++":"++show line++":Invalid %file directive")
		       xs fn ld mod line v
    | otherwise = \fn ld mod line v ->
                     failP (fn++":"++show line++":Invalid %file directive")
		       xs fn ld mod line v

mkname :: String -> Token
mkname "_" = TokenUnderscore
mkname c = TokenName (UN c)

getstr :: Bool ->  -- if True, don't process escapes
          String -> Maybe (String,String,Int)
getstr raw cs = case getstr' raw "" cs 0 of
                  Just (str,rest,nls) -> Just (reverse str,rest,nls)
                  _ -> Nothing
getstr' _ acc ('\"':xs) = \nl -> Just (acc,xs,nl)
getstr' False acc ('\\':'n':xs) = getstr' False ('\n':acc) xs -- Newline
getstr' False acc ('\\':'r':xs) = getstr' False ('\r':acc) xs -- CR
getstr' False acc ('\\':'t':xs) = getstr' False ('\t':acc) xs -- Tab
getstr' False acc ('\\':'b':xs) = getstr' False ('\b':acc) xs -- Backspace
getstr' False acc ('\\':'a':xs) = getstr' False ('\a':acc) xs -- Alert
getstr' False acc ('\\':'f':xs) = getstr' False ('\f':acc) xs -- Formfeed
getstr' False acc ('\\':'0':xs) = getstr' False ('\0':acc) xs -- null
getstr' False acc ('\\':x:xs) = getstr' False (x:acc) xs -- Literal
-- You can always escape double quotes, even in a raw string
getstr' raw acc ('\\':'"':xs) = getstr' raw ('\"':acc) xs -- Quote
getstr' False acc ('\n':xs) = 
    \nl -> getstr' False ('\n':acc) xs (nl+1) -- Count the newline
getstr' raw acc (x:xs) = getstr' raw (x:acc) xs
getstr' _ _ _ = \nl -> Nothing

getchar :: Bool -> -- if True, don't process escapes
           String -> Maybe (Char,String)
getchar False ('\\':'n':'\'':xs) = Just ('\n',xs) -- Newline
getchar False ('\\':'r':'\'':xs) = Just ('\r',xs) -- CR
getchar False ('\\':'t':'\'':xs) = Just ('\t',xs) -- Tab
getchar False ('\\':'b':'\'':xs) = Just ('\b',xs) -- Backspace
getchar False ('\\':'a':'\'':xs) = Just ('\a',xs) -- Alert
getchar False ('\\':'f':'\'':xs) = Just ('\f',xs) -- Formfeed
getchar False ('\\':'0':'\'':xs) = Just ('\0',xs) -- null
getchar False ('\\':x:'\'':xs) = Just (x,xs) -- Literal
getchar _ (x:'\'':xs) = Just (x,xs)
getchar _ _ = Nothing
