{- SPDX-FileCopyrightText: 2018-2019 Serokell - - SPDX-License-Identifier: MPL-2.0 -} {-# OPTIONS_GHC -fno-warn-orphans #-} -- | Markdown documents markdownScanner. module Xrefcheck.Scanners.Markdown ( MarkdownConfig (..) , defGithubMdConfig , markdownScanner , markdownSupport , parseFileInfo , makeError ) where import Universum import CMarkGFM (Node (..), NodeType (..), PosInfo (..), commonmarkToNode, optFootnotes) import Control.Lens (_Just, makeLenses, makeLensesFor, (.=)) import Control.Monad.Trans.Writer.CPS (Writer, runWriter, tell) import Data.Aeson (FromJSON (..), genericParseJSON) import Data.ByteString.Lazy qualified as BSL import Data.DList qualified as DList import Data.Default (def) import Data.Text qualified as T import Data.Text.Lazy qualified as LT import Fmt (Buildable (..), blockListF, nameF, (+|), (|+)) import Text.HTML.TagSoup import Xrefcheck.Core import Xrefcheck.Scan import Xrefcheck.Util data MarkdownConfig = MarkdownConfig { mcFlavor :: Flavor } deriving stock (Generic) instance FromJSON (MarkdownConfig) where parseJSON = genericParseJSON aesonConfigOption defGithubMdConfig :: MarkdownConfig defGithubMdConfig = MarkdownConfig { mcFlavor = GitHub } instance Buildable Node where build (Node _mpos ty subs) = nameF (show ty) $ blockListF subs toPosition :: Maybe PosInfo -> Position toPosition = Position . \case Nothing -> Nothing Just PosInfo{..} | startLine == endLine -> Just $ startLine |+ ":" +| startColumn |+ "-" +| endColumn |+ "" | otherwise -> Just $ "" +| startLine |+ ":" +| startColumn |+ " - " +| endLine |+ ":" +| endColumn |+ "" -- | Extract text from the topmost node. nodeExtractText :: Node -> Text nodeExtractText = T.strip . mconcat . map extractText . nodeFlatten where extractText = \case TEXT t -> t CODE t -> t _ -> "" nodeFlatten :: Node -> [NodeType] nodeFlatten (Node _pos ty subs) = ty : concatMap nodeFlatten subs data IgnoreMode = IMLink | IMParagraph | IMFile deriving stock (Eq) -- | "ignore link" pragmas in different places behave slightly different, -- so @IgnoreMode@ @Link@ is parametrized data IgnoreLinkState = ExpectingLinkInParagraph -- ^ When ignore annotation is inside @PARAGRAPH@ node, -- we expect a link to ignore later in this paragraph. -- We raise scan error if we see this status after -- traversing subnodes of a @PARAGRAPH@ node. | ExpectingLinkInSubnodes -- ^ If ignore annotation is not inside @PARAGRAPH@, then we expect a link -- in subtree of next node. We raise scan error if we see this status -- after traversing childs of any node that is not an ignore annotation. | ParentExpectsLink -- ^ When we have `ExpectingLinkInSubnodes`, we traverse subtree of some node, -- and we should change `IgnoreLinkState`, because it's not a problem if -- our node's first child doesn't contain a link. So this status means that -- we won't throw errors if we don't find a link for now deriving stock (Eq) data IgnoreModeState = IMSLink IgnoreLinkState | IMSParagraph | IMSFile deriving stock (Eq) -- | Bind `IgnoreMode` to its `PosInfo` so that we can tell where the -- corresponding annotation was declared. data Ignore = Ignore { _ignoreMode :: IgnoreModeState , _ignorePos :: Maybe PosInfo } makeLensesFor [("_ignoreMode", "ignoreMode")] 'Ignore data GetIgnoreMode = NotAnAnnotation | ValidMode IgnoreMode | InvalidMode Text deriving stock (Eq) data ScannerState = ScannerState { _ssIgnore :: Maybe Ignore , _ssParentNodeType :: Maybe NodeType -- ^ @cataNodeWithParentNodeInfo@ allows to get a @NodeType@ of parent node from this field } makeLenses ''ScannerState initialScannerState :: ScannerState initialScannerState = ScannerState { _ssIgnore = Nothing , _ssParentNodeType = Nothing } type ScannerM a = StateT ScannerState (Writer [ScanError]) a -- | A fold over a `Node`. cataNode :: (Maybe PosInfo -> NodeType -> [c] -> c) -> Node -> c cataNode f (Node pos ty subs) = f pos ty (cataNode f <$> subs) -- | Sets correct @_ssParentNodeType@ before running scanner on each node cataNodeWithParentNodeInfo :: (Maybe PosInfo -> NodeType -> [ScannerM a] -> ScannerM a) -> Node -> ScannerM a cataNodeWithParentNodeInfo f node = cataNode f' node where f' pos ty childScanners = f pos ty $ map (ssParentNodeType .= Just ty >>) childScanners -- | Find ignore annotations (ignore paragraph and ignore link) -- and remove nodes that should be ignored removeIgnored :: FilePath -> Node -> Writer [ScanError] Node removeIgnored fp = withIgnoreMode . cataNodeWithParentNodeInfo remove where remove :: Maybe PosInfo -> NodeType -> [ScannerM Node] -> ScannerM Node remove pos ty subs = do let node = Node pos ty [] scan <- use ssIgnore >>= \case -- When no `Ignore` state is set check next node for annotation, -- if found then set it as new `IgnoreMode` otherwise skip node. Nothing -> handleIgnoreMode pos ty subs $ getIgnoreMode node Just (Ignore mode modePos) -> case (mode, ty) of -- We expect to find a paragraph immediately after the -- `ignore paragraph` annotanion. If the paragraph is not -- found we should report an error. (IMSParagraph, PARAGRAPH) -> (ssIgnore .= Nothing) $> defNode (IMSParagraph, x) -> do lift . tell . makeError modePos fp . ParagraphErr $ prettyType x ssIgnore .= Nothing Node pos ty <$> sequence subs -- We don't expect to find an `ignore file` annotation here, -- since that annotation should be at the top of the file and -- the file should already be ignored when `checkIgnoreFile` is called. -- We should report an error if we find it anyway. (IMSFile, _) -> do lift . tell $ makeError modePos fp FileErr ssIgnore .= Nothing Node pos ty <$> sequence subs (IMSLink _, LINK {}) -> do ssIgnore .= Nothing return defNode (IMSLink ignoreLinkState, _) -> do when (ignoreLinkState == ExpectingLinkInSubnodes) $ ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink node' <- Node pos ty <$> sequence subs when (ignoreLinkState == ExpectingLinkInSubnodes) $ do currentIgnore <- use ssIgnore case currentIgnore of Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do lift $ tell $ makeError modePos fp LinkErr ssIgnore .= Nothing _ -> pass return node' when (ty == PARAGRAPH) $ use ssIgnore >>= \case Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) -> lift $ tell $ makeError pragmaPos fp LinkErr _ -> pass return scan handleIgnoreMode :: Maybe PosInfo -> NodeType -> [ScannerM Node] -> GetIgnoreMode -> ScannerM Node handleIgnoreMode pos nodeType subs = \case ValidMode mode -> do ignoreModeState <- case mode of IMLink -> use ssParentNodeType <&> IMSLink . \case Just PARAGRAPH -> ExpectingLinkInParagraph _ -> ExpectingLinkInSubnodes IMParagraph -> pure IMSParagraph IMFile -> pure IMSFile (ssIgnore .= Just (Ignore ignoreModeState correctPos)) $> defNode InvalidMode msg -> do lift . tell $ makeError correctPos fp $ UnrecognisedErr msg (ssIgnore .= Nothing) $> defNode NotAnAnnotation -> Node pos nodeType <$> sequence subs where correctPos = getPosition $ Node pos nodeType [] prettyType :: NodeType -> Text prettyType ty = let mType = safeHead $ words $ show ty in fromMaybe "" mType withIgnoreMode :: ScannerM Node -> Writer [ScanError] Node withIgnoreMode action = action `runStateT` initialScannerState >>= \case -- We expect `Ignore` state to be `Nothing` when we reach EOF, -- otherwise that means there was an annotation that didn't match -- any node, so we have to report that. (node, ScannerState {_ssIgnore = Just (Ignore mode pos)}) -> case mode of IMSParagraph -> do tell . makeError pos fp $ ParagraphErr "EOF" pure node IMSLink _ -> do tell $ makeError pos fp LinkErr pure node IMSFile -> do tell $ makeError pos fp FileErr pure node (node, _) -> pure node -- | Custom `foldMap` for source tree. foldNode :: (Monoid a, Monad m) => (Node -> m a) -> Node -> m a foldNode action node@(Node _ _ subs) = do a <- action node b <- concatForM subs (foldNode action) return (a <> b) type ExtractorM a = ReaderT MarkdownConfig (Writer [ScanError]) a -- | Extract information from source tree. nodeExtractInfo :: FilePath -> Node -> ExtractorM FileInfo nodeExtractInfo fp input@(Node _ _ nSubs) = do if checkIgnoreFile nSubs then return def else diffToFileInfo <$> (foldNode extractor =<< lift (removeIgnored fp input)) where extractor :: Node -> ExtractorM FileInfoDiff extractor node@(Node pos ty _) = case ty of HTML_BLOCK _ -> do return mempty HEADING lvl -> do flavor <- asks mcFlavor let aType = HeaderAnchor lvl let aName = headerToAnchor flavor $ nodeExtractText node let aPos = toPosition pos return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos} HTML_INLINE text -> do let mName = do tag <- safeHead $ parseTags text attributes <- case tag of TagOpen a attrs | T.toLower a == "a" -> Just attrs _ -> Nothing (_, name) <- find (\(field, _) -> T.toLower field `elem` ["name", "id"]) attributes pure name case mName of Just aName -> do let aType = HandAnchor aPos = toPosition pos return $ FileInfoDiff mempty (pure $ Anchor {aType, aName, aPos}) Nothing -> do return mempty LINK url _ -> do let rName = nodeExtractText node rPos = toPosition pos link = if null url then rName else url let (rLink, rAnchor) = case T.splitOn "#" link of [t] -> (t, Nothing) t : ts -> (t, Just $ T.intercalate "#" ts) [] -> error "impossible" return $ FileInfoDiff (DList.singleton $ Reference {rName, rPos, rLink, rAnchor}) DList.empty _ -> return mempty -- | Check if there is `ignore file` at the beginning of the file, -- ignoring preceding comments if there are any. checkIgnoreFile :: [Node] -> Bool checkIgnoreFile nodes = let isSimpleComment :: Node -> Bool isSimpleComment node = isComment node && not (isIgnoreFile node) mIgnoreFile = safeHead $ dropWhile isSimpleComment nodes in maybe False isIgnoreFile mIgnoreFile where isComment :: Node -> Bool isComment = isJust . getCommentContent isIgnoreFile :: Node -> Bool isIgnoreFile = (ValidMode IMFile ==) . getIgnoreMode defNode :: Node defNode = Node Nothing DOCUMENT [] -- hard-coded default Node makeError :: Maybe PosInfo -> FilePath -> ScanErrorDescription -> [ScanError] makeError pos fp errDescription = one $ ScanError (toPosition pos) fp errDescription getCommentContent :: Node -> Maybe Text getCommentContent node = do txt <- getHTMLText node T.stripSuffix "-->" =<< T.stripPrefix "