{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeFamilies #-} -- | Parse TSN XML for the DTD \"newsxml.dtd\". Each document contains -- a root element \ that contains an entire news item. -- module TSN.XML.News ( dtd, has_only_single_sms, pickle_message, -- * Tests news_tests, -- * WARNING: these are private but exported to silence warnings News_LocationConstructor(..), News_TeamConstructor(..), NewsConstructor(..) ) where -- System imports. import Data.Data ( Data, constrFields, dataTypeConstrs, dataTypeOf ) import Data.Time.Clock ( UTCTime ) import Data.List.Utils ( join, split ) import Data.Tuple.Curry ( uncurryN ) import Data.Typeable ( Typeable ) import qualified Data.Vector.HFixed as H ( HVector, convert ) import Database.Groundhog ( countAll, deleteAll, insert_, migrate ) import Database.Groundhog.Core ( DefaultKey ) import Database.Groundhog.Generic ( runDbConn, runMigrationSilent ) import Database.Groundhog.Sqlite ( withSqliteConn ) import Database.Groundhog.TH ( groundhog, mkPersist ) import qualified GHC.Generics as GHC ( Generic ) import Test.Tasty ( TestTree, testGroup ) import Test.Tasty.HUnit ( (@?=), testCase ) import Text.XML.HXT.Core ( PU, XmlTree, (/>), (>>>), addNav, descendantAxis, filterAxis, followingSiblingAxis, hasName, remNav, runLA, xp13Tuple, xpAttr, xpElem, xpInt, xpList, xpOption, xpPair, xpText, xpWrap ) -- Local imports. import TSN.Codegen ( tsn_codegen_config, tsn_db_field_namer ) -- Used in a test import TSN.Database ( insert_or_select ) import TSN.DbImport ( DbImport(..), ImportResult(..), run_dbmigrate ) import TSN.Location ( Location(..), pickle_location ) import TSN.Picklers ( xp_attr_option, xp_time_stamp ) import TSN.Team ( Team(..) ) import TSN.XmlImport ( XmlImport(..) ) import Xml ( FromXml(..), ToDb(..), pickle_unpickle, unpickleable, unsafe_read_invalid_document, unsafe_unpickle ) -- | The DTD to which this module corresponds. Used to invoke dbimport. -- dtd :: String dtd = "newsxml.dtd" -- -- DB/XML Data types -- -- * News/Message -- | The msg_id child of \ contains an event_id attribute; we -- embed it into the 'News' type. We (pointlessly) use the \"db_\" -- prefix here so that the two names don't collide on \"id\" when -- Groundhog is creating its fields using our field namer. -- -- The leading underscores prevent unused field warnings. -- data MsgId = MsgId { _db_msg_id :: Int, _db_event_id :: Maybe Int } deriving (Data, Eq, GHC.Generic, Show, Typeable) -- | For 'H.convert'. -- instance H.HVector MsgId -- | The XML representation of a news item (\). -- data Message = Message { xml_xml_file_id :: Int, xml_heading :: String, xml_mid :: MsgId, xml_category :: String, xml_sport :: String, xml_url :: Maybe String, xml_teams :: [NewsTeamXml], xml_locations :: [Location], xml_sms :: Maybe String, xml_editor :: Maybe String, xml_text :: Maybe String, -- Text and continue seem to show up in pairs, xml_continue :: Maybe String, -- either both present or both missing. xml_time_stamp :: UTCTime } deriving (Eq, GHC.Generic, Show) -- | For 'H.convert'. -- instance H.HVector Message -- | The database representation of a news item. We drop several -- uninteresting fields from 'Message', and omit the list fields which -- will be represented as join tables. -- data News = News { db_xml_file_id :: Int, db_mid :: MsgId, db_sport :: String, db_url :: Maybe String, db_sms :: Maybe String, db_editor :: Maybe String, db_text :: Maybe String, db_continue :: Maybe String, db_time_stamp :: UTCTime } deriving (Data, Eq, Show, Typeable) instance ToDb Message where -- | The database representation of 'Message' is 'News'. type Db Message = News -- | Convert the XML representation 'Message' to the database -- representation 'News'. -- instance FromXml Message where -- | We use a record wildcard so GHC doesn't complain that we never -- used the field names. -- -- To convert, we drop some fields. -- from_xml Message{..} = News { db_xml_file_id = xml_xml_file_id, db_mid = xml_mid, db_sport = xml_sport, db_url = xml_url, db_sms = xml_sms, db_editor = xml_editor, db_text = xml_text, db_continue = xml_continue, db_time_stamp = xml_time_stamp } -- | This lets us insert the XML representation 'Message' directly. -- instance XmlImport Message -- * NewsTeamXml -- | The XML type for teams as they show up in the news. We can't -- reuse the representation from "TSN.Team" because our name doesn't -- appear optional and we have no abbreviation. -- data NewsTeamXml = NewsTeamXml { xml_team_id :: String, xml_team_name :: String } deriving (Eq, GHC.Generic, Show) -- | For 'H.convert'. -- instance H.HVector NewsTeamXml instance ToDb NewsTeamXml where -- | The database representation of 'NewsTeamXml' is 'Team'. type Db NewsTeamXml = Team -- | Convert the XML representation 'NewsTeamXml' to the database -- representation 'Team'. -- instance FromXml NewsTeamXml where from_xml NewsTeamXml{..} = Team { team_id = xml_team_id, abbreviation = Nothing, name = Just xml_team_name } -- | Allow us to import 'NewsTeamXml' directly. -- instance XmlImport NewsTeamXml -- * News_Team -- | Mapping between News records and Team records in the database. We -- don't name the fields because we don't use the names explicitly; -- that means we have to give them nice database names via -- groundhog. -- data News_Team = News_Team (DefaultKey News) (DefaultKey Team) -- * News_Location -- | Mapping between 'News' records and 'Location' records in the -- database. We don't name the fields because we don't use the names -- explicitly; that means we have to give them nice database names -- via groundhog. -- data News_Location = News_Location (DefaultKey News) (DefaultKey Location) -- | Some newsxml documents contain two \ elements in a row, -- violating the DTD. The second one has always been empty, but it's -- irrelevant: we can't parse these, and would like to detect them -- in order to report the fact that the busted document is -- unsupported. -- -- This function detects whether two \ elements appear in a -- row, as siblings. -- has_only_single_sms :: XmlTree -> Bool has_only_single_sms xmltree = case elements of [] -> True _ -> False where parse :: XmlTree -> [XmlTree] parse = runLA $ hasName "/" /> hasName "message" >>> addNav >>> descendantAxis >>> filterAxis (hasName "SMS") >>> followingSiblingAxis >>> remNav >>> hasName "SMS" elements = parse xmltree -- -- * Database code -- -- | Define 'dbmigrate' and 'dbimport' for 'Message's. The import is -- slightly non-generic because of our 'News_Team' and -- 'News_Location' join tables. -- instance DbImport Message where dbmigrate _ = run_dbmigrate $ do migrate (undefined :: Location) migrate (undefined :: News) migrate (undefined :: Team) migrate (undefined :: News_Team) migrate (undefined :: News_Location) dbimport message = do -- Insert the message and acquire its primary key (unique ID) news_id <- insert_xml message -- Now insert the teams. We use insert_xml_or_select because we -- know that most teams will already exist, and we want to get -- back the id for the existing team when there's a collision. team_ids <- mapM insert_xml_or_select (xml_teams message) -- Now that the teams have been inserted, create -- news__team records mapping beween the two. let news_teams = map (News_Team news_id) team_ids mapM_ insert_ news_teams -- Do all of that over again for the Locations. loc_ids <- mapM insert_or_select (xml_locations message) let news_news_locations = map (News_Location news_id) loc_ids mapM_ insert_ news_news_locations return ImportSucceeded -- These types have fields with e.g. db_ and xml_ prefixes, so we -- use our own codegen to peel those off before naming the columns. mkPersist tsn_codegen_config [groundhog| - entity: News constructors: - name: News uniques: - name: unique_news type: constraint # Prevent multiple imports of the same message. fields: [db_xml_file_id] fields: - name: db_mid embeddedType: - {name: msg_id, dbName: msg_id} - {name: event_id, dbName: event_id} - embedded: MsgId fields: - name: _db_msg_id dbName: msg_id - name: _db_event_id dbName: event_id - entity: News_Team dbName: news__teams constructors: - name: News_Team fields: - name: news_Team0 # Default created by mkNormalFieldName dbName: news_id reference: onDelete: cascade - name: news_Team1 # Default created by mkNormalFieldName dbName: teams_id reference: onDelete: cascade - entity: News_Location dbName: news__locations constructors: - name: News_Location fields: - name: news_Location0 # Default created by mkNormalFieldName dbName: news_id reference: onDelete: cascade - name: news_Location1 # Default created by mkNormalFieldName dbName: locations_id reference: onDelete: cascade |] -- -- XML Picklers -- -- | Convert a 'NewsTeamXml' to/from XML. -- pickle_news_team :: PU NewsTeamXml pickle_news_team = xpElem "team" $ xpWrap (from_tuple, H.convert) $ xpPair (xpAttr "id" xpText) xpText -- team name where from_tuple = uncurry NewsTeamXml -- | Convert a 'MsgId' to/from XML. -- pickle_msg_id :: PU MsgId pickle_msg_id = xpElem "msg_id" $ xpWrap (from_tuple, H.convert) $ xpPair xpInt (xpAttr "EventId" xp_attr_option) where from_tuple = uncurryN MsgId -- | Convert a 'Message' to/from XML. -- pickle_message :: PU Message pickle_message = xpElem "message" $ xpWrap (from_tuple, H.convert) $ xp13Tuple (xpElem "XML_File_ID" xpInt) (xpElem "heading" xpText) pickle_msg_id (xpElem "category" xpText) (xpElem "sport" xpText) (xpElem "url" $ xpOption xpText) (xpList pickle_news_team) (xpList pickle_location) (xpElem "SMS" $ xpOption xpText) (xpOption (xpElem "Editor" xpText)) (xpOption (xpElem "text" xpText)) pickle_continue (xpElem "time_stamp" xp_time_stamp) where from_tuple = uncurryN Message -- | We combine all of the \ elements into one 'String' -- while unpickling and do the reverse while pickling. -- pickle_continue :: PU (Maybe String) pickle_continue = xpOption $ xpWrap (to_string, from_string) $ xpElem "continue" $ xpList (xpElem "P" xpText) where from_string :: String -> [String] from_string = split "\n" to_string :: [String] -> String to_string = join "\n" -- -- Tasty Tests -- -- | A list of all tests for this module. -- news_tests :: TestTree news_tests = testGroup "News tests" [ test_news_fields_have_correct_names, test_on_delete_cascade, test_pickle_of_unpickle_is_identity, test_unpickle_succeeds, test_sms_detected_correctly ] -- | Make sure our codegen is producing the correct database names. -- test_news_fields_have_correct_names :: TestTree test_news_fields_have_correct_names = testCase "news fields get correct database names" $ mapM_ check (zip actual expected) where -- This is cool, it uses the (derived) Data instance of -- News.News to get its constructor names. field_names :: [String] field_names = constrFields . head $ dataTypeConstrs $ dataTypeOf (undefined :: News) expected :: [String] expected = map (\x -> tsn_db_field_namer "herp" "derp" 8675309 x 90210) field_names actual :: [String] actual = ["xml_file_id", "mid", "sport", "url", "sms", "editor", "text", "continue"] check (x,y) = (x @?= y) -- | If we unpickle something and then pickle it, we should wind up -- with the same thing we started with. WARNING: success of this -- test does not mean that unpickling succeeded. -- test_pickle_of_unpickle_is_identity :: TestTree test_pickle_of_unpickle_is_identity = testGroup "pickle-unpickle tests" [ check "pickle composed with unpickle is the identity" "test/xml/newsxml.xml", check "pickle composed with unpickle is the identity (with Editor)" "test/xml/newsxml-with-editor.xml", check "pickle composed with unpickle is the identity (empty SMS)" "test/xml/newsxml-empty-sms.xml" ] where check desc path = testCase desc $ do (expected, actual) <- pickle_unpickle pickle_message path actual @?= expected -- | Make sure we can actually unpickle these things. -- test_unpickle_succeeds :: TestTree test_unpickle_succeeds = testGroup "unpickle tests" [ check "unpickling succeeds" "test/xml/newsxml.xml", check "unpickling succeeds (with Editor)" "test/xml/newsxml-with-editor.xml", check "unpickling succeeds (empty SMS)" "test/xml/newsxml-empty-sms.xml" ] where check desc path = testCase desc $ do actual <- unpickleable path pickle_message let expected = True actual @?= expected -- | Make sure everything gets deleted when we delete the top-level -- record. -- test_on_delete_cascade :: TestTree test_on_delete_cascade = testGroup "cascading delete tests" [ check "deleting news deletes its children" "test/xml/newsxml.xml" 4 -- 2 news_teams and 2 news_locations that should remain. , check "deleting news deletes its children (empty SMS)" "test/xml/newsxml-empty-sms.xml" 4 -- 2 news_teams and 2 news_locations ] where check desc path expected = testCase desc $ do news <- unsafe_unpickle path pickle_message let a = undefined :: Location let b = undefined :: News let c = undefined :: Team let d = undefined :: News_Team let e = undefined :: News_Location actual <- withSqliteConn ":memory:" $ runDbConn $ do runMigrationSilent $ do migrate a migrate b migrate c migrate d migrate e _ <- dbimport news deleteAll b count_a <- countAll a count_b <- countAll b count_c <- countAll c count_d <- countAll d count_e <- countAll e return $ count_a + count_b + count_c + count_d + count_e actual @?= expected -- | We want to make sure the single-SMS documents and the multi-SMS -- documents are identified correctly. -- test_sms_detected_correctly :: TestTree test_sms_detected_correctly = testGroup "newsxml SMS count determined correctly" [ check "test/xml/newsxml.xml" "single SMS detected correctly" True, check "test/xml/newsxml-multiple-sms.xml" "multiple SMS detected correctly" False ] where check path desc expected = testCase desc $ do xmltree <- unsafe_read_invalid_document path let actual = has_only_single_sms xmltree actual @?= expected