{-|
Module      : Data.RDF.Parser.NQuads
Description : Representation and Incremental Processing of RDF Data
Copyright   : Travis Whitaker 2016
License     : MIT
Maintainer  : pi.boy.travis@gmail.com
Stability   : Provisional
Portability : Portable

A parser for <https://www.w3.org/TR/2014/REC-n-quads-20140225/ RDF 1.1 N-Quads>.
-}

{-# LANGUAGE OverloadedStrings #-}

module Data.RDF.Parser.NQuads (
    Result
  , parseNQuads
  , parseTriple
  , parseQuad
  , parseQuadLine
  , foldGraphs
  , foldResults
  ) where

import qualified Data.Attoparsec.Text      as A
import qualified Data.Attoparsec.Text.Lazy as AL

import Data.RDF.Types
import Data.RDF.Parser.Common

import qualified Data.Text.Lazy as TL

-- | Either a successfully parsed 'RDFGraph' ('Right') or a parse error message
--   ('Left').
type Result = Either String RDFGraph

-- | A parser for
--   <https://www.w3.org/TR/2014/REC-n-quads-20140225/ RDF 1.1 N-Quads>. This
--   parser works incrementally by first lazily splitting the input into lines,
--   then parsing each line of the N-Quads document individually. This allows
--   for incremental processing in constant space, as well as extracting any
--   valid data from an N-Quads document that contains some invalid quads.
--   'TL.Text' is used because the RDF 1.1 specification stipulates that RDF
--   should always be encoded with Unicode.
--
--   Lines that are empty or consist solely of white space carry no statement.
--   They are skipped rather than reported as parse errors, so a blank line
--   between two quads of the same graph does not split that graph.
--
--   Due to its incremental nature, this parser will accept some N-Quads
--   documents that are not legal according to the RDF 1.1 specification.
--   Specifically, this parser will provide duplicate 'Triple's if they exist in
--   the input N-Quads document; a proper graph consists of true sets of nodes
--   and edges, i.e. no duplicate nodes or edges. Any downstream program
--   incrementally consuming this parser's output should take care to ignore any
--   supernumerary triples.
--
--   Likewise, if a graph's constituent triples are not contiguous in the input
--   N-Quads document, then they will not be folded into contiguous 'RDFGraph's
--   in this parser's output. Any downstream program incrementally consuming
--   this parser's output and performing graph processing that discriminates
--   based on graph labels will not necessarily be presented each contiguous
--   labeled graph as a single 'RDFGraph' record. For example, something like
--   this could be used to lazily find all 'RDFGraph' records containing a named
--   graph's 'Triple's. Downstream processing must then be able to handle a
--   single named graph spanning multiple 'RDFGraph' records.
--
-- > filterGraph :: (Maybe IRI) -> [RDFGraph] -> [RDFGraph]
-- > filterGraph gl = filter (\g -> (graphLabel g) == gl)
parseNQuads :: TL.Text -> [Result]
parseNQuads = foldResults
            . map (AL.eitherResult . AL.parse parseQuad)
              -- Blank lines are legal in N-Quads (EOL is one or more CR/LF
              -- characters); drop them so they do not surface as errors.
            . filter (not . TL.null . TL.strip)
            . TL.lines

-- | Group a list of 'Quad's into 'RDFGraph's. Every maximal run of
--   neighbouring 'Quad's sharing a graph label becomes one 'RDFGraph'; a label
--   that shows up again later in the input starts a fresh 'RDFGraph'. Within
--   each 'RDFGraph' the 'Triple's are listed in the reverse of input order.
foldGraphs :: [Quad] -> [RDFGraph]
foldGraphs [] = []
foldGraphs (first:others) =
    RDFGraph label (reverse (map quadTriple (first:run)))
        : foldGraphs remaining
    where label            = quadGraph first
          -- Everything up to the first quad carrying a different label
          -- belongs to the graph currently being assembled.
          (run, remaining) = span ((label ==) . quadGraph) others

-- | Group a list of parse outcomes into 'Result's. Parse errors are passed
--   through in place; every maximal run of neighbouring successfully parsed
--   'Quad's sharing a graph label becomes one 'RDFGraph'. A parse error ends
--   the run in progress, and within each 'RDFGraph' the 'Triple's are listed in
--   the reverse of input order.
foldResults :: [Either String Quad] -> [Result]
foldResults results = case results of
    []              -> []
    Left err : rest -> Left err : foldResults rest
    Right q : rest  -> collect (quadGraph q) [quadTriple q] rest
    where -- Extend the current graph while the next item is a quad with the
          -- same label; otherwise emit the graph and resume with whatever
          -- follows (end of input, an error, or a differently labeled quad).
          collect label acc pending = case pending of
              Right q : rest
                  | label == quadGraph q ->
                      collect label (quadTriple q : acc) rest
              _ -> Right (RDFGraph label acc) : foldResults pending

-- | Parse the subject, predicate and object of a statement, in that order,
--   into a 'Triple'. White space after the subject and after the predicate is
--   skipped; nothing is consumed after the object.
parseTriple :: A.Parser Triple
parseTriple = do
    subj <- parseSubject
    A.skipSpace
    prd <- parsePredicate
    A.skipSpace
    obj <- parseObject
    pure (Triple subj prd obj)

-- | Parse one N-Quads statement into a 'Quad': a 'Triple', then the graph
--   label as read by 'parseGraphLabel', then the terminating @.@. White space
--   before the graph label and before the @.@ is skipped; nothing after the
--   @.@ is consumed.
parseQuad :: A.Parser Quad
parseQuad = do
    triple <- parseTriple
    A.skipSpace
    label <- parseGraphLabel
    A.skipSpace
    _ <- A.char '.'
    pure (Quad triple label)

-- | Parse a single N-Quads 'Quad' on its own line, consuming the line
--   terminator that follows it. This parser is suitable for using Attoparsec's
--   incremental input mechanism 'parse'/'feed' instead of a lazy 'TL.Text'.
--
--   The terminator may be a bare line feed or a carriage return followed by a
--   line feed, so documents with either line-ending convention are accepted.
parseQuadLine :: A.Parser Quad
parseQuadLine = parseQuad <* A.endOfLine