--------------------------------------------------------------------------------
--  See end of this file for licence information.
--------------------------------------------------------------------------------
-- |
--  Module      :  NTParser
--  Copyright   :  (c) 2011 Douglas Burke
--  License     :  GPL V2
--
--  Maintainer  :  Douglas Burke
--  Stability   :  experimental
--  Portability :  H98
--
--  This Module implements a NTriples parser (see [1]), returning a
--  new 'RDFGraph' consisting of triples and namespace information parsed from
--  the supplied NTriples input string, or an error indication.
--
--  Uses the Parsec monadic parser library.
--
-- REFERENCES:
--
-- 1 <http://www.w3.org/TR/rdf-testcases/#ntriples>
--     RDF Test Cases
--     W3C Recommendation 10 February 2004
--
--------------------------------------------------------------------------------

module Swish.RDF.NTParser
    ( ParseResult
    , parseNT      
    , parsefromString
    
    -- * Exports for parsers that embed NTriples in a bigger syntax
    , NTParser, NTState(..)
    , ntripleDoc
    , line, ws, comment, eoln
    , character, name, triple
    , subject, predicate, object
    , uriref, urirefLbl
    , nodeID, literal, language

    )
where

import Swish.RDF.RDFGraph
    ( RDFGraph, RDFLabel(..)
    , addArc 
    , emptyRDFGraph
    )

import Swish.RDF.GraphClass
    ( arc )

import Swish.Utils.Namespace
    ( ScopedName(..)
    , makeUriScopedName
    )

import Swish.RDF.Vocabulary (langName)

import Swish.RDF.RDFParser
    ( ParseResult, RDFParser
    , ignore
    , annotateParsecError
    )

import Control.Applicative
import Control.Monad (when)

import Network.URI (parseURI)

import Data.Char (chr) 
import Data.Maybe (fromMaybe, isNothing)

import Text.ParserCombinators.Parsec hiding (many, optional, (<|>))

----------------------------------------------------------------------
-- Define parser state and helper functions
----------------------------------------------------------------------

-- | NT parser state
data NTState = NTState
        { graphState :: RDFGraph            -- Graph under construction
        }

--  Return function to update graph in NT parser state,
--  using the supplied function of a graph
--  (use returned function with Parsec updateState)
updateGraph :: ( RDFGraph -> RDFGraph ) -> NTState -> NTState
updateGraph f s = s { graphState = f (graphState s) }

----------------------------------------------------------------------
--  Define top-level parser function:
--  accepts a string and returns a graph or error
----------------------------------------------------------------------

type NTParser a = RDFParser NTState a

-- | Parse a string.
-- 
parseNT ::
  String -- ^ input in NTriples format.
  -> ParseResult
parseNT = parsefromString ntripleDoc
-- parseNT = either Error Result . parsefromString ntripleDoc

{-
-- useful for testing
test :: String -> RDFGraph
test = either error id . parsefromString ntripleDoc
-}

-- | Function to supply initial context and parse supplied term.
--
-- We augment the Parsec error with the context.
--
parsefromString :: 
    NTParser a      -- ^ parser to apply
    -> String       -- ^ input to be parsed
    -> Either String a
parsefromString parser input =
        let
            pstate = NTState
                    { graphState = emptyRDFGraph
                    }
            result = runParser parser pstate "" input
        in
            case result of
                Right res -> Right res
                Left  err -> Left $ annotateParsecError 1 (lines input) err

-- helper routines

fullStop :: NTParser ()
fullStop = ignore (char '.')

{-
lineFeed :: NTParser ()
lineFeed = ignore (char '\r')
-}

-- Add statement to graph in NT parser state

addStatement :: RDFLabel -> RDFLabel -> RDFLabel -> NTParser ()
addStatement s p o = updateState (updateGraph (addArc (arc s p o) ))

----------------------------------------------------------------------
--  Syntax productions
----------------------------------------------------------------------

{-

EBNF from the specification, using the notation from XML 1.0, second edition,
is included inline below.

We do not force ASCII 7-bit semantics here yet.

space	::=	#x20 /* US-ASCII space - decimal 32 */	
cr	::=	#xD /* US-ASCII carriage return - decimal 13 */	
lf	::=	#xA /* US-ASCII line feed - decimal 10 */	
tab	::=	#x9 /* US-ASCII horizontal tab - decimal 9 */	

The productions are kept as close as possible to the specification
for now.

-}

{-
ntripleDoc	::=	line*	
line	::=	ws* ( comment | triple )? eoln	

We relax the rule that the input must be empty or end with a new line.

ntripleDoc :: NTParser RDFGraph
ntripleDoc = graphState <$> (many line *> eof *> getState)

line :: NTParser ()
line = skipMany ws *> optional (comment <|> triple) *> eoln
-}

ntripleDoc :: NTParser RDFGraph
ntripleDoc = graphState <$> (sepBy line eoln *> optional eoln *> skipMany ws *> eof *> getState)

line :: NTParser ()
line = skipMany ws *> ignore (optional (comment <|> triple))

{-
ws	::=	space | tab	

Could use whiteSpace rule here, but that would permit
constructs (e.g. comments) where we do not support them.
-}
ws :: NTParser ()
ws = ignore (char ' ' <|> tab) <?> "white space (' ' or tab)"

{-
comment	::=	'#' ( character - ( cr | lf ) )*	
-}

comment :: NTParser ()
comment = char '#' *> skipMany (noneOf "\r\n") <?> "comment line"

{-
eoln	::=	cr | lf | cr lf	
-}

eoln :: NTParser ()
-- eoln = ignore (newline <|> (lineFeed *> optional newline))
eoln = ignore (try (string "\r\n") <|> string "\r" <|> string "\n")
       <?> "new line"
       
{-
name	::=	[A-Za-z][A-Za-z0-9]*	
-}

hChars, bChars :: String
hChars = ['a'..'z'] ++ ['A'..'Z']
bChars = hChars ++ ['0'..'9']

name :: NTParser String
name = (:) <$> oneOf hChars <*> many (oneOf bChars)

{-
triple	::=	subject ws+ predicate ws+ object ws* '.' ws*	

-}

triple :: NTParser ()
triple = do
  s <- subject
  skipMany1 ws
  p <- predicate
  skipMany1 ws
  o <- object
  skipMany ws
  fullStop
  skipMany ws
  addStatement s p o

{-
subject	::=	uriref | nodeID	
predicate	::=	uriref	
object	::=	uriref | nodeID | literal	
-}

subject :: NTParser RDFLabel
subject = urirefLbl <|> nodeID

predicate :: NTParser RDFLabel
predicate = urirefLbl

object :: NTParser RDFLabel
object = urirefLbl <|> nodeID <|> literal

{-
uriref	::=	'<' absoluteURI '>'	
absoluteURI	::=	character+ with escapes as defined in section URI References	

-}

uriref :: NTParser ScopedName
uriref = do
  ustr <- char '<' *> manyTill character (char '>')
  when (isNothing (parseURI ustr)) $
    fail ("Invalid URI: <" ++ ustr ++ ">")
  return $ makeUriScopedName ustr

urirefLbl :: NTParser RDFLabel
urirefLbl = Res <$> uriref

{-
nodeID	::=	'_:' name	
-}

nodeID :: NTParser RDFLabel
nodeID = Blank <$> (string "_:" *> name) <?> "blank node (_:label)"

{-  
literal	::=	langString | datatypeString	
langString	::=	'"' string '"' ( '@' language )?	
datatypeString	::=	'"' string '"' '^^' uriref	
language	::=	[a-z]+ ('-' [a-z0-9]+ )*
encoding a language tag.	
string	::=	character* with escapes as defined in section Strings	

-}

literal :: NTParser RDFLabel
literal = Lit <$> between (char '"') (char '"') (many character) <*> optionMaybe dtlang

dtlang :: NTParser ScopedName
dtlang = 
    (char '@' *> language)
    <|> (string "^^" *> uriref)

language :: NTParser ScopedName
language = do
  h <- many1 (oneOf ['a'..'z'])
  mt <- optionMaybe ( (:) <$> char '-' <*> many1 (oneOf (['a'..'z'] ++ ['0'..'9'])) )
  return $ langName $ h ++ fromMaybe "" mt

{-
String handling: 

EBNF has:

character	::=	[#x20-#x7E] /* US-ASCII space to decimal 126 */	

Additional information from:

  http://www.w3.org/TR/rdf-testcases/#ntrip_strings

N-Triples strings are sequences of US-ASCII character productions encoding [UNICODE] character strings. The characters outside the US-ASCII range and some other specific characters are made available by \-escape sequences as follows:

 Unicode character
 (with code point u)	N-Triples encoding
 [#x0-#x8]	\uHHHH
 4 required hexadecimal digits HHHH encoding Unicode character u
 #x9	\t
 #xA	\n
 [#xB-#xC]	\uHHHH
 4 required hexadecimal digits HHHH encoding Unicode character u
 #xD	\r
 [#xE-#x1F]	\uHHHH
 4 required hexadecimal digits HHHH encoding Unicode character u
 [#x20-#x21]	the character u
 #x22	\"
 [#x23-#x5B]	the character u
 #x5C	\\
 [#x5D-#x7E]	the character u
 [#x7F-#xFFFF]	\uHHHH
 4 required hexadecimal digits HHHH encoding Unicode character u
 [#10000-#x10FFFF]	\UHHHHHHHH
 8 required hexadecimal digits HHHHHHHH encoding Unicode character u
 where H is a hexadecimal digit: [#x30-#x39],[#x41-#x46] (0-9, uppercase A-F).

This escaping satisfies the [CHARMOD] section Reference Processing Model on making the full Unicode character range U+0 to U+10FFFF available to applications and providing only one way to escape any character.

-}

asciiChars :: String
asciiChars = map chr [0x20..0x7e]

asciiCharsNT :: String
asciiCharsNT = filter (`notElem` "\\\"") asciiChars

ntHexDigit :: NTParser Char
ntHexDigit = oneOf $ ['0'..'9'] ++ ['A'..'F']

hex4 :: NTParser Char
hex4 = do
  digs <- count 4 ntHexDigit
  let dstr = "0x" ++ digs
      dchar = read dstr :: Int
  return $ chr dchar
        
hex8 :: NTParser Char
hex8 = do
  digs <- count 8 ntHexDigit
  let dstr = "0x" ++ digs
      dchar = read dstr :: Int
  if dchar <= 0x10FFFF
    then return $ chr dchar
    else unexpected "\\UHHHHHHHH format is limited to a maximum of \\U0010FFFF"

protectedChar :: NTParser Char
protectedChar =
  (char 't' *> return '\t')
  <|> (char 'n' *> return '\n')
  <|> (char 'r' *> return '\r')
  <|> (char '"' *> return '"')
  <|> (char '\\' *> return '\\')
  <|> (char 'u' *> hex4)
  <|> (char 'U' *> hex8)

character :: NTParser Char
character = (char '\\' *> protectedChar)
      <|> (oneOf asciiCharsNT <?> "ASCII character")

--------------------------------------------------------------------------------
--
--  Copyright (c) 2003, Graham Klyne, 2009 Vasili I Galchin, 2011 Douglas Burke
--  All rights reserved.
--
--  This file is part of Swish.
--
--  Swish is free software; you can redistribute it and/or modify
--  it under the terms of the GNU General Public License as published by
--  the Free Software Foundation; either version 2 of the License, or
--  (at your option) any later version.
--
--  Swish is distributed in the hope that it will be useful,
--  but WITHOUT ANY WARRANTY; without even the implied warranty of
--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
--  GNU General Public License for more details.
--
--  You should have received a copy of the GNU General Public License
--  along with Swish; if not, write to:
--    The Free Software Foundation, Inc.,
--    59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
--
--------------------------------------------------------------------------------