{-# LANGUAGE NoImplicitPrelude, GeneralizedNewtypeDeriving, OverloadedStrings #-} {-| Module : Yarn.Lock.Parse Description : Parser for yarn.lock files Maintainer : Profpatsch Stability : experimental This module provides a parser for the AST of @yarn.lock@ files. -} module Yarn.Lock.Parse ( PackageFields(..), Package -- * Parsing -- ** Re-export , Parser -- ** Parsers , packageList , packageEntry -- * Internal Parsers , field, nestedField, simpleField , packageKeys ) where import Protolude hiding (try, some, many) import qualified Data.Char as Ch import qualified Data.List.NonEmpty as NE import qualified Data.Text as T import qualified Data.Map.Strict as M import Control.Monad (fail) import Text.Megaparsec as MP import qualified Text.Megaparsec.Char as MP import qualified Text.Megaparsec.Char.Lexer as MPL -- import qualified Data.MultiKeyedMap as MKM -- import Data.Proxy (Proxy(..)) import qualified Yarn.Lock.Types as YLT -- | We use a simple (pure) @Megaparsec@ parser. type Parser = Parsec Void Text -- | The @yarn.lock@ format doesn’t specifically include a fixed scheme, -- it’s just an unnecessary custom version of a list of fields. -- -- An field can either be a string or more fields w/ deeper indentation. -- -- The actual conversion to semantic structures needs to be done afterwards. newtype PackageFields = PackageFields (Map Text (Either Text PackageFields)) deriving (Show, Eq, Monoid) -- | A parsed 'Package' AST has one or more keys, a position in the original files -- and a collection of fields. type Package = YLT.Keyed (SourcePos, PackageFields) -- | Parse a complete yarn.lock into an abstract syntax tree, -- keeping the source positions of each package entry. packageList :: Parser [Package] packageList = MP.many $ (skipMany (comment <|> MP.string "\n")) *> packageEntry where comment :: Parser (Tokens Text) comment = MP.char '#' *> takeWhileP Nothing (/= '\n') -- | A single Package. -- -- Example: -- -- @ -- handlebars@^4.0.4: -- version "4.0.6" -- resolved "https://registry.yarnpkg.com/handlebars/-/handlebars-4.0.6.tgz#2ce4484850537f9c97a8026d5399b935c4ed4ed7" -- dependencies: -- async "^1.4.0" -- optimist "^0.6.1" -- source-map "^0.4.4" -- optionalDependencies: -- uglify-js "^2.6" -- " -- @ packageEntry :: Parser (YLT.Keyed (SourcePos, PackageFields)) packageEntry = label "package entry" $ do pos <- getPosition -- A package entry is a non-indented (keys, pkgs) <- nonIndented -- block that has a header of package keys -- and an indented part that contains fields $ indentedFieldsWithHeader packageKeys pure $ YLT.Keyed keys (pos, pkgs) -- | The list of PackageKeys that index the same Package -- -- @ -- align-text@^0.1.1, align-text@^0.1.3:\\n -- @ packageKeys :: Parser (NE.NonEmpty YLT.PackageKey) packageKeys = label "package keys" $ do firstEls <- many (try $ lexeme $ packageKey ":," <* MP.char ',') lastEl <- packageKey ":" <* MP.char ':' pure $ NE.fromList $ firstEls <> [lastEl] -- | A packageKey is @\\@\@; -- -- If the semver contains spaces, it is also quoted with @"@. packageKey :: [Char] -> Parser YLT.PackageKey packageKey separators = inString (pkgKey "\"") -- if no string delimiters is used we need to check for the separators -- this file format is shit :< <|> pkgKey separators "package key" where pkgKey :: [Char] -> Parser YLT.PackageKey pkgKey valueChars = label "package key" $ do key <- someTextOf (MP.noneOf valueChars) -- okay, here’s the rub: -- `@` is used for separation, but package names can also -- start with the `@` character (so-called “scoped packages”). -- Furthermore, versions can contain `@` as well. -- This file format is a pile of elephant shit. case breakDrop '@' key of ("", rest) -> case breakDrop '@' rest of -- scoped key with empty name ("", _) -> emptyKeyErr key -- scoped key ("@scope/package") (scopedName, ver) -> YLT.PackageKey <$> scoped (T.cons '@' scopedName) <*> pure ver -- just a simple key (name, ver) -> pure $ YLT.PackageKey (YLT.SimplePackageKey name) ver emptyKeyErr :: Text -> Parser a emptyKeyErr key = fail ("packagekey: package name can not be empty (is: " <> toS key <> ")") -- | Like 'T.breakOn', but drops the separator char. breakDrop :: Char -> Text -> (Text, Text) breakDrop c str = case T.breakOn (T.singleton c) str of (s, "") -> (s, "") (s, s') -> (s, T.drop 1 s') -- | Parses a (scoped) package key and throws an error if misformatted. scoped n = maybe (fail $ "packageKey: scoped variable must be of form @scope/package" <> " (is: " <> toS n <> ")") pure $ YLT.parsePackageKeyName n -- | Either a simple or a nested field. field :: Parser (Text, Either Text PackageFields) field = try nested <|> simple "field" where simple = fmap Left <$> simpleField nested = fmap Right <$> nestedField -- | A key-value pair, separated by space. -- Key any value may be enclosed in "". -- Returns key and value. simpleField :: Parser (Text, Text) simpleField = (,) <$> lexeme (strSymbolChars <|> symbolChars) -- valueChars may be in Strings or maybe not >: -- this file format is absolute garbage <*> (strValueChars <|> valueChars) "simple field" where valueChars, strValueChars :: Parser Text valueChars = someTextOf (MP.noneOf ("\n\r\"" :: [Char])) strSymbolChars = inString $ symbolChars strValueChars = inString $ valueChars -- as with packageKey semvers, this can be empty <|> (pure T.empty "an empty value field") -- | Similar to a @simpleField@, but instead of a string -- we get another block with deeper indentation. nestedField :: Parser (Text, PackageFields) nestedField = label "nested field" $ indentedFieldsWithHeader (symbolChars <* MP.char ':') -- internal parsers -- | There are two kinds of indented blocks: -- One where the header is the package -- and one where the header is already a package field key. indentedFieldsWithHeader :: Parser a -> Parser (a, PackageFields) indentedFieldsWithHeader header = indentBlock $ do -- … block that has a header of package keys hdr <- header -- … and an indented part that contains fields pure $ MPL.IndentSome Nothing (\fields -> pure (hdr, toPfs fields)) field where toPfs :: [(Text, Either Text PackageFields)] -> PackageFields toPfs = PackageFields . M.fromList -- | Characters allowed in key symbols. -- -- TODO: those are partly npm package names, so check the allowed symbols, too. -- -- Update: npm doesn’t specify the package name format, at all. -- Apart from the length. -- Update: According to https://docs.npmjs.com/misc/scope -- the package name format is “URL-safe characters, no leading dots or underscores” TODO symbolChars :: Parser Text symbolChars = label "key symbol" $ someTextOf $ MP.satisfy (\c -> Ch.isAscii c && (Ch.isLower c || Ch.isUpper c || Ch.isNumber c || c `elem` special)) where special = "-_.@/" :: [Char] -- text versions of parsers & helpers someTextOf :: Parser Char -> Parser Text someTextOf c = T.pack <$> some c -- | parse everything as inside a string inString :: Parser a -> Parser a inString = between (MP.char '"') (MP.char '"') -- lexers -- | Parse whitespace. space :: Parser () space = MPL.space (void MP.spaceChar) (MPL.skipLineComment "# ") (void $ MP.satisfy (const False)) -- | Parse a lexeme. lexeme :: Parser a -> Parser a lexeme = MPL.lexeme space -- | Ensure parser is not indented. nonIndented :: Parser a -> Parser a nonIndented = MPL.nonIndented space indentBlock :: Parser (MPL.IndentOpt Parser a b) -> Parser a indentBlock = MPL.indentBlock space