module Text.PageIO.Index where
import Codec.Text.IConv
import Control.Exception (try)
import Control.Monad (unless)
import Data.Char (isSpace)
import Data.List (nub, sort, intersperse)
import Data.Monoid (mappend)
import Database.SQLite
import System.Environment
import System.Time
import Debug.Trace
import Text.PageIO.Extract
import Text.PageIO.Parser (packLBS)
import Text.PageIO.Transform
import Text.PageIO.Types
import Text.Printf
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.UTF8 as UTF8
import qualified Text.PageIO.LabelMap as LM
addTables dbh = mapM_ (execStatement_ dbh)
[ "CREATE VIRTUAL TABLE doc USING FTS3 ( head TEXT, body TEXT )"
, "CREATE TABLE idx ( pri_nid TEXT, sec_nid TEXT, doc_name INT )"
, "CREATE UNIQUE INDEX doc_name_idx ON idx (doc_name)"
, "CREATE TABLE txt ( body TEXT )"
]
addColumn dbh cols = (`mapM_` cols) $ \col -> mapM_ (execStatement_ dbh)
[ "ALTER TABLE idx ADD COLUMN '" ++ col ++ "' COLLATE NOCASE"
, "CREATE INDEX '" ++ col ++ "_idx' ON idx ('" ++ col ++ "' COLLATE NOCASE)"
]
indexDocs :: Sheet -> [Doc] -> IO ()
indexDocs MkSheet{ sheetName, sheetFields, sheetFrames, sheetBox = MkBox{ boxRight, boxBottom } } docs = do
env <- getEnvironment
let dbName = case lookup "PIO_DB" env of
Just n -> n
_ -> LM.fromLabel sheetName
putStr dbName
dbh <- openConnection $ dbName ++ ".db"
addTables dbh
addColumn dbh columns
dbhDetails <- case lookup "PIO_DETAILS_DB" env of
Just n -> do
h <- openConnection $ n ++ ".db"
addTables h
addColumn h ("fulltext":detailColumns)
execStatement_ h "BEGIN EXCLUSIVE TRANSACTION"
return (Just h)
_ -> return Nothing
execStatement_ dbh "BEGIN EXCLUSIVE TRANSACTION"
(`mapM_` ([1..] `zip` docs)) $ \(docIndex, MkDoc meta contents) -> try $ do
putStr "."
let fields = resultFields meta
body = UTF8.toString . packLBS $ convert "CP950" "UTF-8" contents
let valOf x = LM.lookup (LM.toLabel x) fields
year = case valOf "year_roc" of
Just year_roc -> valToInt year_roc + 11
_ -> case valOf "year" of
Just year -> valToInt year
_ -> 0
r_date | year == 0 = "0"
| otherwise =
let month = maybe 1 valToInt (valOf "month")
day = maybe 1 valToInt (valOf "day")
in fromYMD (fixY2K year) month day
fixY2K year | year >= 1900 = year
| year >= 70 = 1900 + year
| otherwise = 2000 + year
let attrsVanilla =
[ (LM.fromLabel lbl, dropWhile isSpace (decode val))
| (lbl, val) <- LM.toList fields
]
decode bound = UTF8.toString . packLBS $ convert "CP950" "UTF-8" (L.fromChunks [bound])
attrs = (("r_id", LM.fromLabel sheetName):("r_date", r_date):attrsVanilla)
attrHead = unlines (map snd attrs)
cols = concatMap ((++ "'") . (", '" ++) . fst) attrs
prms = concatMap ((++ "'") . (", '" ++) . snd) attrs
execStatement_ dbh $ concat
[ "INSERT INTO doc (head, body) VALUES ('"
, attrHead
, "', '"
, body
, "')"
]
execStatement_ dbh $ concat
[ "INSERT INTO idx (doc_name, pri_nid, sec_nid"
, cols
, ") VALUES (last_insert_rowid(), "
, show boxRight
, ", "
, show boxBottom
, prms
, ")"
]
let blocks = LM.elems $ resultBlocks meta
case dbhDetails of
Just h -> do
execStatement_ h $ concat
[ "INSERT INTO txt (rowid, body) VALUES ("
, show docIndex
, ", '"
, body
, "')"
]
(`mapM_` foldl blockProduct [] blocks) $ \(area, vals) -> try $ unless (LM.null vals) $ do
let attrsBlock =
[ (LM.fromLabel lbl, dropWhile isSpace (decode val))
| (lbl, val) <- LM.toList vals
]
attrs' = attrs ++ map maybeFix attrsBlock
attrHead' = unlines (map snd attrs')
cols' = concatMap ((++ "'") . (", '" ++) . fst) attrs'
prms' = concatMap ((++ "'") . (", '" ++) . snd) attrs'
maybeFix ("expiry_date", date) = ("expiry_date", parseDate date)
maybeFix (c@['F','D','S','D',x,'D'], date@(_:_))
| x == 'B' || x == 'E' || x == 'O'
= (c, parseDate date)
maybeFix x = x
parseDate date = fromYMD y m d
where
i = read date
y = 1911 + (i `div` 10000)
m = i `mod` 10000 `div` 100
d = i `mod` 100
execStatement_ h $ concat
[ "INSERT INTO doc (head, body) VALUES ('"
, attrHead'
, "', '"
, concatMap decode (pageLines area)
, "')"
]
execStatement_ h $ concat
[ "INSERT INTO idx (doc_name, fulltext, pri_nid, sec_nid"
, cols'
, ") VALUES (last_insert_rowid(), '"
, show docIndex
, "', "
, show boxRight
, ", "
, show boxBottom
, prms'
, ")"
]
return ()
_ -> return ()
execStatement_ dbh "COMMIT"
case dbhDetails of
Just h -> execStatement_ h "COMMIT"
_ -> return Nothing
closeConnection dbh
putStrLn "done!"
where
columns = nub $ sort ("r_date":"r_id":map LM.fromLabel (LM.keys sheetFields))
detailColumns = nub $ sort ("r_date":"r_id":map LM.fromLabel (concatMap LM.keys (sheetFields : frameFields)))
frameFields = concatMap (map blockFields . LM.elems . frameBlocks) sheetFrames
blockProduct :: [(Area, LabelMap Bound)] -> BlockResult -> [(Area, LabelMap Bound)]
blockProduct [] (MkBlockResult ys) = ys
blockProduct xs (MkBlockResult []) = xs
blockProduct xs (MkBlockResult ys) =
[ (xa `mappend` ya, xb `mappend` yb)
| (xa, xb) <- xs
, (ya, yb) <- ys
]
fromYMD :: Int -> Int -> Int -> String
fromYMD y m d = case toClockTime cal of
TOD sec _ -> show (succ (sec `div` 86400))
where
cal = CalendarTime
{ ctYear = y
, ctMonth = toEnum (m1)
, ctDay = d
, ctHour = 0
, ctMin = 0
, ctSec = 0
, ctPicosec = 0
, ctWDay = Sunday
, ctYDay = 0
, ctTZName = "UTC"
, ctTZ = 0
, ctIsDST = False
}