{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE RankNTypes #-} -- | -- Module: BDCS.Import.NPM -- Copyright: (c) 2017 Red Hat, Inc. -- License: LGPL -- -- Maintainer: https://github.com/weldr -- Stability: alpha -- Portability: portable -- -- Functions for importing NPM packages into the database module BDCS.Import.NPM(loadFromURI) where import Control.Monad(void) import Control.Monad.Catch(MonadThrow) import Control.Monad.Except(MonadError, runExceptT, throwError) import Control.Monad.IO.Class(MonadIO, liftIO) import Control.Monad.Reader(ReaderT, ask) import Control.Monad.State(MonadState, get, modify) import Control.Monad.Trans.Resource(MonadBaseControl, MonadResource) import Data.Aeson(FromJSON(..), Object, Value(..), (.:), (.:?), (.!=), eitherDecode, withObject, withText) import Data.Aeson.Types(Parser, typeMismatch) import Data.Bits((.|.)) import Data.ByteArray(convert) import qualified Data.ByteString as BS import qualified Data.ByteString.Char8 as S8 import qualified Data.ByteString.Lazy as BSL import Data.Conduit(Conduit, Consumer, ZipConduit(..), (.|), getZipConduit, runConduitRes, toConsumer, yield) import Data.Conduit.Binary(sinkLbs) import qualified Data.Conduit.Combinators as CC import qualified Data.Conduit.List as CL import Data.Conduit.Lift(evalStateLC) import qualified Data.Conduit.Tar as CT import Data.ContentStore(ContentStore, storeByteStringSink) import Data.ContentStore.Digest(ObjectDigest) import qualified Data.HashMap.Lazy as HM import Data.List(isPrefixOf) import Data.Maybe(fromMaybe) import qualified Data.Text as T import Data.Text.Encoding(decodeUtf8) import Database.Persist.Sql(SqlPersistT) import Network.URI(URI(..), URIAuth(..), nullURI, parseURI, relativeTo) import System.FilePath((), makeRelative, normalise, takeDirectory, takeFileName) import System.Posix.Files(blockSpecialMode, characterSpecialMode, directoryMode, namedPipeMode, regularFileMode, symbolicLinkMode) import Text.Regex.PCRE((=~)) import BDCS.Build.NPM(rebuildNPM) import BDCS.DB import BDCS.Files(associateFilesWithSource, insertFiles) import BDCS.Import.Conduit(getFromURI, ungzipIfCompressed) import BDCS.Import.State(ImportState(..)) import BDCS.KeyType import BDCS.Projects(insertProject) import BDCS.Sources(insertSource, insertSourceKeyValue) import BDCS.Utils.Either(whenLeft) import BDCS.Utils.Monad((>>?)) -- base URI for the package.json information npmRegistry :: URI npmRegistry = URI {uriScheme = "https:", uriAuthority = Just URIAuth{uriUserInfo = "", uriRegName = "registry.npmjs.org", uriPort = ""}, uriPath = "/", uriQuery = "", uriFragment = "" } -- The data returned by the registry contains a lot of the same things as package.json, -- but it may not actually match the real data in package.json. The only part we can -- use is information in the "dist" object. data PackageDist = PackageDist { _integrity :: Maybe T.Text, _shasum :: Maybe T.Text, tarball :: T.Text } deriving(Show) instance FromJSON PackageDist where parseJSON = withObject "registry JSON" $ \v -> case HM.lookup "dist" v of Nothing -> fail "Missing dist object" Just (Object d) -> PackageDist <$> d .:? "integrity" <*> d .:? "shasum" <*> d .: "tarball" Just _ -> fail "Dist not an object" -- the parts of package.json we care about data PackageJSON = PackageJSON { packageName :: T.Text, packageVersion :: T.Text, description :: T.Text, homepage :: Maybe T.Text, license :: T.Text, -- This can show up in package.json in two ways: either as the map of executable -- names to js paths ("bin" : { "exec1": "./script1.js", "exec2": "./script2.js" }), -- or as a single string ("bin": "./script1.js"). The single string case should be -- interpreted as the path to an executable that should be named the same as the name -- of the package. bin :: Maybe [(T.Text, T.Text)], -- This can appear as either a list of strings or a single string man :: Maybe [T.Text], -- The package.json documentation implies that it is an error to have both bin and -- directories.bin in package.json. And then npm itself does exactly that. So, what -- this actually means: -- -- * If bin is present, in any form (even if it is empty), it takes -- precedence and directories.bin is ignored for our purposes -- * If bin is not present and directories.bin is present, every path in this directory -- gets symlinked to /usr/bin. Subdirectories gets symlinked too and are not traversed, -- so if you have /subdir, that gets symlinked as /usr/bin/subdir. binDirectory :: Maybe T.Text, -- Similar to bin, if man is present (even as an empty list), then this is ignored. -- Subdirectories are traversed. manDirectory :: Maybe T.Text, -- list of packagename, semver pairs dependencies :: Maybe [(T.Text, T.Text)] } deriving(Show) instance FromJSON PackageJSON where parseJSON = withObject "package.json" $ \v -> PackageJSON <$> v .: "name" <*> v .: "version" <*> v .:? "description" .!= "" <*> v .:? "homepage" <*> v .:? "license" .!= "" <*> parseBin v <*> v .:? "man" <*> parseDirectories v "bin" <*> parseDirectories v "man" <*> ((v .:? "dependencies") >>? parseTextObject) where -- custom handler for directories.bin and directories.man, to get rid of the intermediate object, -- and to skip if it's overriden by bin or man. parseDirectories :: Object -> T.Text -> Parser (Maybe T.Text) parseDirectories obj key = if HM.member key obj then return Nothing else case HM.lookup "directories" obj of Nothing -> return Nothing Just (Object v) -> v .:? key Just err -> typeMismatch "Object" err -- parse "bin", which has a mixed type parseBin :: Object -> Parser (Maybe [(T.Text, T.Text)]) parseBin obj = do -- retrieve the name for the String case name <- (obj .: "name") >>= withText "String" return case HM.lookup "bin" obj of Nothing -> return Nothing -- list of strings, return as a list of pairs Just v@(Object _) -> Just <$> parseTextObject v -- just a String, pair with the package name Just (String s) -> return $ Just [(name, s)] Just err -> typeMismatch "Object or String" err -- Convert an object that's all "key":"value" pairs to a list of (Text, Text) parseTextObject :: Value -> Parser [(T.Text, T.Text)] parseTextObject = withObject "Object" $ HM.foldrWithKey f (return []) where f :: T.Text -> Value -> Parser [(T.Text, T.Text)] -> Parser [(T.Text, T.Text)] f key val acc = withText "String" (\s -> ((key, s):) <$> acc) val readRegistryJSON :: (MonadError String m, MonadBaseControl IO m, MonadThrow m, MonadIO m) => String -> m PackageDist readRegistryJSON pkgname = do let uri = relativeTo (nullURI {uriPath = pkgname ++ "/latest"}) npmRegistry jsonData <- runConduitRes $ getFromURI uri .| sinkLbs either throwError return $ eitherDecode jsonData loadIntoMDDB :: MonadIO m => PackageJSON -> [Files] -> SqlPersistT m (Key Sources) loadIntoMDDB PackageJSON{..} files = do -- Create the project/source/build entries from the package.json data -- npm doesn't provide separate descriptions and summaries, so just leave projects.description blank -- upstream_vcs is usually the same as homepage, but we can't tell automatically so leave that blank too projectId <- insertProject $ Projects packageName description "" homepage "" sourceId <- insertSource $ Sources projectId license packageVersion "" fileIds <- insertFiles files void $ associateFilesWithSource fileIds sourceId -- load the bin information into the mddb as key/val pairs. /usr/bin symlinks will not be created -- until export, since there could be multiple versions of a package installed as dependencies and -- we only want the bin symlinks for the top-level ones. -- If there is an explicit bin list, that takes precendence, otherwise use directories.bin. case (bin, binDirectory) of (Just binlist, _) -> mapM_ (addBin sourceId) binlist (Nothing, Just binDir) -> addBinDir sourceId binDir _ -> return () -- similar thing for man pages case (man, manDirectory) of (Just manList, _) -> mapM_ (addMan sourceId) manList (Nothing, Just manDir) -> addManDir sourceId manDir _ -> return () -- save the requirements as build key/vals. These are the semver requirement ranges. -- When the source is "linked" into a build, and from to an exportable group, the semvers -- will be translated to exact-version requirements and stored in the requirements table. mapM_ (\(reqname, reqver) -> insertSourceKeyValue (TextKey "dependency") reqname (Just reqver) sourceId) $ fromMaybe [] dependencies -- mark the source as coming from npm -- TODO figure out a better way to express this kind of thing void $ insertSourceKeyValue (TextKey "npm") "" Nothing sourceId return sourceId where normaliseText :: T.Text -> T.Text normaliseText path = T.pack $ normalise $ T.unpack path -- package.json contains "bin", which is a list of (, ) pairs -- Insert as k=bin, v=, e= addBin :: MonadIO m => Key Sources -> (T.Text, T.Text) -> SqlPersistT m () addBin sourceId (binName, path) = void $ insertSourceKeyValue (TextKey "bin") binName (Just (normaliseText path)) sourceId -- package.json contains "directories.bin", which means everything in that directory -- should become a /usr/bin symlink, using the name of the file as the name of the symlink. -- No recursion, so if there's something like /subdir/subpath, subdir gets a symlink -- and subpath is otherwise ignored. -- Create KeyVal values like in addBin, using the filename for v. addBinDir :: MonadIO m => Key Sources -> T.Text -> SqlPersistT m () addBinDir sourceId binDir = let -- normalize out the leading "./" and any other funkiness binPrefix = normalise (T.unpack binDir) -- find paths where the directory component is the same as the prefix binFiles = filter (\p -> takeDirectory p == binPrefix) $ map (makeRelative "/" . T.unpack . filesPath) files in mapM_ (\p -> insertSourceKeyValue (TextKey "bin") (T.pack $ takeFileName p) (Just (T.pack p)) sourceId) binFiles addMan :: MonadIO m => Key Sources -> T.Text -> SqlPersistT m () addMan sourceId manName = void $ insertSourceKeyValue (TextKey "man") (normaliseText manName) Nothing sourceId -- Unlike directories.bin, we do need to recurse into this directory addManDir :: MonadIO m => Key Sources -> T.Text -> SqlPersistT m () addManDir sourceId manDir = let manPrefix = normalise (T.unpack manDir) paths = map (makeRelative "/" . T.unpack . filesPath) files manFiles = filter (\p -> (manPrefix `isPrefixOf` p) && (p =~ ("\\.[0-9]$" :: String))) paths in mapM_ (\p -> insertSourceKeyValue (TextKey "man") (T.pack p) Nothing sourceId) manFiles -- | Fetch an NPM from a given 'URI' and load it into the MDDB. This function must be -- run within the 'ReaderT' monad, which should be given an 'ImportState' record. This -- is how importing knows where to store the results. Errors will be printed to the -- screen. loadFromURI :: URI -> ReaderT ImportState IO () loadFromURI uri@URI{..} = do db <- stDB <$> ask repo <- stRepo <$> ask result <- runExceptT $ do -- Fetch the JSON describing the package distJson <- readRegistryJSON uriPath -- Get the URI to the tarball out of the JSON let distTarball = T.unpack $ tarball distJson distURI <- maybe (throwError $ "Error parsing dist URI: " ++ distTarball) return $ parseURI distTarball -- conduits for consuming the tar entries: -- this one loads the content into the content store and returns a list of (uninserted) Files records let pathPipe = tarEntryToFile repo .| CL.consume -- this one returns the parsed package.json let jsonPipe = parsePackageJson -- Zip them together into a sink returning a tuple let tarSink = getZipConduit ((,) <$> ZipConduit pathPipe <*> ZipConduit jsonPipe) -- Import the tarball to the content store (files, packageJson) <- runConduitRes $ getFromURI distURI .| ungzipIfCompressed .| CT.untar .| tarSink -- Insert the metadata checkAndRunSqlite (T.pack db) $ do source <- loadIntoMDDB packageJson files -- Link the dependencies for the source rebuildNPM source whenLeft result (\e -> liftIO $ print $ "Error importing " ++ show uri ++ ": " ++ show e) where -- TODO handle TarExceptions tarEntryToFile :: (MonadError String m, MonadThrow m, MonadResource m) => ContentStore -> Conduit CT.TarChunk m Files tarEntryToFile cs = -- Run the tar processing in a state with a map from FilePath to (ObjectDigest, CT.Size), -- so hardlinks can get the data they need from earlier entries. evalStateLC HM.empty $ CT.withEntries handleEntry where handleEntry :: (MonadState (HM.HashMap FilePath (ObjectDigest, CT.Size)) m, MonadError String m, MonadResource m) => CT.Header -> Conduit BS.ByteString m Files handleEntry header@CT.Header{..} = do let entryPath = CT.headerFilePath header -- Ignore the mode from tar. Set everything to 0644 if it's a file, 0755 if it's a directory let modeBits = if CT.headerFileType header == CT.FTDirectory then 0o0755 else 0o0644 -- Add the file type bits to the mode based on the tar header type let typeBits = case CT.headerFileType header of CT.FTNormal -> regularFileMode CT.FTHardLink -> regularFileMode CT.FTSymbolicLink -> symbolicLinkMode CT.FTCharacterSpecial -> characterSpecialMode CT.FTBlockSpecial -> blockSpecialMode CT.FTDirectory -> directoryMode CT.FTFifo -> namedPipeMode -- TODO? CT.FTOther _ -> 0 -- Make the start of a Files record. Ignore the user/group/mode from tar let baseFile = Files{filesPath = T.pack ("/" normalise entryPath), filesFile_user = "root", filesFile_group = "root", filesMtime = fromIntegral headerTime, filesMode = modeBits .|. fromIntegral typeBits, filesTarget = Nothing, filesCs_object = Nothing, filesSize = 0} file <- case CT.headerFileType header of -- for NormalFile, add the object to the content store, add the digest and size to the state, and add the digest in the record CT.FTNormal -> handleRegularFile baseFile entryPath headerPayloadSize -- For hard links, the content is the link target: look it up in the state and fill in the digest and size CT.FTHardLink -> handleHardLink baseFile -- for symlinks, set the target CT.FTSymbolicLink -> handleSymlink baseFile -- TODO? CT.FTOther code -> throwError $ "Unknown tar entry type " ++ show code -- TODO: need somewhere for block/char special major and minor -- otherwise nothing else has anything special _ -> return baseFile yield file handleRegularFile :: (MonadState (HM.HashMap FilePath (ObjectDigest, CT.Size)) m, MonadError String m, MonadResource m) => Files -> FilePath -> CT.Size -> Consumer BS.ByteString m Files handleRegularFile baseFile entryPath size = do digest <- toConsumer $ storeByteStringSink cs modify (HM.insert entryPath (digest, size)) return $ baseFile {filesSize = fromIntegral size, filesCs_object = Just $ convert digest} handleHardLink :: (MonadState (HM.HashMap FilePath (ObjectDigest, CT.Size)) m, MonadError String m) => Files -> Consumer BS.ByteString m Files handleHardLink baseFile = do -- Use the same ByteString -> FilePath unpacking that headerFilePath uses target <- S8.unpack <$> CC.fold (HM.lookup target <$> get) >>= maybe (throwError $ "Broken hard link to " ++ target) (\(digest, size) -> return $ baseFile {filesSize = fromIntegral size, filesCs_object = Just $ convert digest}) handleSymlink :: Monad m => Files -> Consumer BS.ByteString m Files handleSymlink baseFile = do target <- decodeUtf8 <$> CC.fold return $ baseFile {filesTarget = Just target} -- TODO handle TarExceptions parsePackageJson :: (MonadError String m, MonadThrow m) => Consumer CT.TarChunk m PackageJSON parsePackageJson = CT.withEntry handler >>= maybe parsePackageJson return where handler :: MonadError String m => CT.Header -> Consumer BS.ByteString m (Maybe PackageJSON) handler header = let path = makeRelative "/" $ normalise $ CT.headerFilePath header in -- Everything in an npm tarball is under "package/" if path == ("package" "package.json") then (eitherDecode <$> BSL.fromStrict <$> CC.fold) >>= either throwError (return . Just) else return Nothing