diff --git a/CHANGES.md b/CHANGES.md
index a4d037eb..3663d2ae 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -36,11 +36,13 @@ Unreleased
+ Now we call references to anchors in current file (e.g. `[a](#b)`) as
`file-local` references instead of calling them `current file` (which was ambiguous).
* [#233](https://github.com/serokell/xrefcheck/pull/233)
- + Now xrefxcheck does not follow redirect links by default. It fails for permanent
+ + Now xrefcheck does not follow redirect links by default. It fails for permanent
redirect responses (i.e. 301 and 308) and passes for temporary ones (i.e. 302, 303, 307).
* [#231](https://github.com/serokell/xrefcheck/pull/231)
+ Anchor analysis takes now into account the appropriate case-sensitivity depending on
the configured Markdown flavour.
+* [#240](https://github.com/serokell/xrefcheck/pull/240)
+ + Now xrefcheck is able to detect possible copy-pastes relying on links and their names.
0.2.2
==========
diff --git a/README.md b/README.md
index 73d76c56..81c60f97 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ Comparing to alternative solutions, this tool tries to achieve the following poi
* Supports external links (`http`, `https`, `ftp` and `ftps`).
* Detects broken and ambiguous anchors in local links.
* Integration with GitHub Actions.
+* Detects possible bad copy-pastes of links.
## Dependencies [↑](#xrefcheck)
@@ -148,6 +149,21 @@ There are several ways to fix this:
* By default, `xrefcheck` will ignore links to localhost.
* This behavior can be disabled by removing the corresponding entry from the `ignoreExternalRefsTo` list in the config file.
+1. How do I disable copy-paste check for specific links?
+ * Add a `` annotation before the link:
+ ```md
+
+ Links with bad copypaste:
+ [good link](https://good.link.uri/).
+ [copypasted link](https://good.link.uri/).
+ ```
+ ```md
+ A [good link](https://good.link.uri/)
+ followed by an [copypasted intentionally](https://good.link.uri/).
+ ```
+ * You can use a `` annotation to disable copy-paste check in a paragraph.
+ * You can use a `` annotation at the top of the file to disable copy-paste check within an entire file.
+
## Further work [↑](#xrefcheck)
- [ ] Support link detection in different languages, not only Markdown.
diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs
index 3863285f..e6ad91fd 100644
--- a/src/Xrefcheck/Core.hs
+++ b/src/Xrefcheck/Core.hs
@@ -12,6 +12,7 @@ module Xrefcheck.Core where
import Universum
import Control.Lens (makeLenses)
+import Control.Lens.Combinators (makeLensesWith)
import Data.Aeson (FromJSON (..), withText)
import Data.Char (isAlphaNum)
import Data.Char qualified as C
@@ -70,14 +71,17 @@ instance Given ColorMode => Buildable Position where
-- | Full info about a reference.
data Reference = Reference
- { rName :: Text
+ { rName :: Text
-- ^ Text displayed as reference.
- , rLink :: Text
+ , rLink :: Text
-- ^ File or site reference points to.
- , rAnchor :: Maybe Text
+ , rAnchor :: Maybe Text
-- ^ Section or custom anchor tag.
- , rPos :: Position
+ , rPos :: Position
+ , rCheckCopyPaste :: Bool
+ -- ^ Whether to check bad copy/paste for this link
} deriving stock (Show, Generic, Eq, Ord)
+makeLensesWith postfixFields ''Reference
-- | Context of anchor.
data AnchorType
@@ -102,9 +106,9 @@ data FileInfoDiff = FileInfoDiff
}
makeLenses ''FileInfoDiff
-diffToFileInfo :: FileInfoDiff -> FileInfo
-diffToFileInfo (FileInfoDiff refs anchors) =
- FileInfo (DList.toList refs) (DList.toList anchors)
+diffToFileInfo :: Bool -> FileInfoDiff -> FileInfo
+diffToFileInfo cpcEnabledInFile (FileInfoDiff refs anchors) =
+ FileInfo (DList.toList refs) (DList.toList anchors) cpcEnabledInFile
instance Semigroup FileInfoDiff where
FileInfoDiff a b <> FileInfoDiff c d = FileInfoDiff (a <> c) (b <> d)
@@ -114,13 +118,14 @@ instance Monoid FileInfoDiff where
-- | All information regarding a single file we care about.
data FileInfo = FileInfo
- { _fiReferences :: [Reference]
- , _fiAnchors :: [Anchor]
+ { _fiReferences :: [Reference]
+ , _fiAnchors :: [Anchor]
+ , _fiCopyPasteCheck :: Bool
} deriving stock (Show, Generic)
makeLenses ''FileInfo
instance Default FileInfo where
- def = diffToFileInfo mempty
+ def = diffToFileInfo True mempty
data ScanPolicy
= OnlyTracked
diff --git a/src/Xrefcheck/Scan.hs b/src/Xrefcheck/Scan.hs
index da312a0e..017e8606 100644
--- a/src/Xrefcheck/Scan.hs
+++ b/src/Xrefcheck/Scan.hs
@@ -117,18 +117,27 @@ data ScanErrorDescription
= LinkErr
| FileErr
| ParagraphErr Text
+ | LinkErrCpc
+ | FileErrCpc
+ | ParagraphErrCpc Text
| UnrecognisedErr Text
deriving stock (Show, Eq)
instance Buildable ScanErrorDescription where
build = \case
LinkErr -> [int||Expected a LINK after "ignore link" annotation|]
+ LinkErrCpc -> [int||Expected a LINK after "no duplication check in link" annotation|]
FileErr -> [int||Annotation "ignore all" must be at the top of \
markdown or right after comments at the top|]
+ FileErrCpc -> [int||Annotation "no duplication check in file" must be at the top of \
+ markdown or right after comments at the top|]
ParagraphErr txt -> [int||Expected a PARAGRAPH after \
"ignore paragraph" annotation, but found #{txt}|]
- UnrecognisedErr txt -> [int||Unrecognised option "#{txt}" perhaps you meant \
- <"ignore link"|"ignore paragraph"|"ignore all">|]
+ ParagraphErrCpc txt -> [int||Expected a PARAGRAPH after \
+ "no duplication check in paragraph" annotation, but found #{txt}|]
+ UnrecognisedErr txt -> [int||Unrecognised option "#{txt}", perhaps you meant
+ "ignore "
+ or "no duplication check in "?|]
specificFormatsSupport :: [([Extension], ScanAction)] -> FormatsSupport
specificFormatsSupport formats = \ext -> M.lookup ext formatsMap
diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs
index ce684b39..0829417a 100644
--- a/src/Xrefcheck/Scanners/Markdown.hs
+++ b/src/Xrefcheck/Scanners/Markdown.hs
@@ -54,6 +54,24 @@ instance Buildable C.Node where
build (C.Node _mpos ty mSubs) = nameF (show ty) $
maybe "[]" interpolateBlockListF (nonEmpty mSubs)
+data Node a = Node
+ { _ndPos :: Maybe PosInfo
+ , _ndType :: NodeType
+ , _ndInfo :: a
+ , _ndSubs :: [Node a]
+ }
+
+instance Buildable (Node a) where
+ build (Node _mpos ty _info mSubs) = nameF (show ty) $
+ maybe "[]" interpolateBlockListF (nonEmpty mSubs)
+
+-- Here and below CPC stands for "copy/paste check"
+type NodeCPC = Node CopyPasteCheck
+
+newtype CopyPasteCheck = CopyPasteCheck
+ { cpcShouldCheck :: Bool
+ } deriving stock (Show, Eq, Generic)
+
toPosition :: Maybe PosInfo -> Position
toPosition = Position . \case
Nothing -> Nothing
@@ -68,7 +86,7 @@ toPosition = Position . \case
|]
-- | Extract text from the topmost node.
-nodeExtractText :: (C.Node) -> Text
+nodeExtractText :: Node info -> Text
nodeExtractText = T.strip . mconcat . map extractText . nodeFlatten
where
extractText = \case
@@ -76,8 +94,8 @@ nodeExtractText = T.strip . mconcat . map extractText . nodeFlatten
CODE t -> t
_ -> ""
- nodeFlatten :: (C.Node) -> [NodeType]
- nodeFlatten (C.Node _pos ty subs) = ty : concatMap nodeFlatten subs
+ nodeFlatten :: Node info -> [NodeType]
+ nodeFlatten (Node _pos ty _info subs) = ty : concatMap nodeFlatten subs
data IgnoreMode
@@ -120,6 +138,7 @@ makeLensesFor [("_ignoreMode", "ignoreMode")] 'Ignore
data Annotation
= IgnoreAnnotation IgnoreMode
+ | IgnoreCopyPasteCheck IgnoreMode
| InvalidAnnotation Text
deriving stock (Eq)
@@ -127,6 +146,8 @@ data Annotation
data ScannerState = ScannerState
{ _ssIgnore :: Maybe Ignore
+ , _ssIgnoreCopyPasteCheck :: Maybe Ignore
+ , _ssParagraphExpectedAfterCpcAnnotation :: Bool
, _ssParentNodeType :: Maybe NodeType
-- ^ @cataNodeWithParentNodeInfo@ allows to get a @NodeType@ of parent node from this field
}
@@ -135,7 +156,9 @@ makeLenses ''ScannerState
initialScannerState :: ScannerState
initialScannerState = ScannerState
{ _ssIgnore = Nothing
+ , _ssIgnoreCopyPasteCheck = Nothing
, _ssParentNodeType = Nothing
+ , _ssParagraphExpectedAfterCpcAnnotation = False
}
type ScannerM a = StateT ScannerState (Writer [ScanError]) a
@@ -155,40 +178,49 @@ cataNodeWithParentNodeInfo f node = cataNode f' node
map (ssParentNodeType .= Just ty >>) childScanners
-- | Find ignore annotations (ignore paragraph and ignore link)
--- and remove nodes that should be ignored.
-processAnnotations :: FilePath -> C.Node -> Writer [ScanError] C.Node
+-- and remove nodes that should be ignored;
+-- find copy/paste check annotations (ignore for paragraph and for link)
+-- and label nodes with a boolean meaning whether they should be
+-- copy/paste checked.
+processAnnotations :: FilePath -> C.Node -> Writer [ScanError] NodeCPC
processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process
where
process
:: Maybe PosInfo
-> NodeType
- -> [ScannerM C.Node]
- -> ScannerM C.Node
+ -> [ScannerM NodeCPC]
+ -> ScannerM NodeCPC
process pos ty subs = do
let node = C.Node pos ty []
- use ssIgnore >>= \ign -> do
+ use ssIgnore >>= \ign ->
+ use ssIgnoreCopyPasteCheck >>= \ignCPC -> do
-- When no `Ignore` state is set check next node for annotation,
-- if found then set it as new `IgnoreMode` otherwise skip node.
- let mbAnnotation = getAnnotation node
- case mbAnnotation of
+ let shouldCheckCPC = CopyPasteCheck $ isNothing ignCPC
+ let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs
+ case getAnnotation node of
Just ann -> handleAnnotation pos ty ann
Nothing -> do
case ty of
- PARAGRAPH -> handleParagraph ign pos ty subs
- LINK {} -> handleLink ign pos ty subs
- IMAGE {} -> handleLink ign pos ty subs
- _ -> handleOther ign pos ty subs
-
- handleLink ::
- Maybe Ignore ->
- Maybe PosInfo ->
- NodeType ->
- [ScannerM C.Node] ->
- ScannerM C.Node
- handleLink ign pos ty subs = do
- let traverseChildren = C.Node pos ty <$> sequence subs
- -- It can be checked that it's correct for all the cases
+ PARAGRAPH -> handleParagraph ign traverseChildren
+ LINK {} -> handleLink ign ty traverseChildren
+ IMAGE {} -> handleLink ign ty traverseChildren
+ _ -> handleOther ign ty traverseChildren
+
+ handleLink
+ :: Maybe Ignore
+ -> NodeType
+ -> ScannerM NodeCPC
+ -> ScannerM NodeCPC
+ handleLink ign ty traverseChildren = do
+ -- It's common for all ignore states
ssIgnore .= Nothing
+ -- If there was a copy/paste ignore annotation that expected link,
+ -- reset this state
+ resetCpcIgnoreIfLink
+ -- If right now there was a copy/paste ignore annotation for paragraph,
+ -- emit an error and reset these states.
+ reportExpectedParagraphAfterIgnoreCpcAnnotation ty
case ign of
Nothing -> traverseChildren
@@ -198,75 +230,125 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process
Just (Ignore (IMSLink _) _) -> do
pure defNode
- handleParagraph ::
- Maybe Ignore ->
- Maybe PosInfo ->
- NodeType ->
- [ScannerM C.Node] ->
- ScannerM C.Node
- handleParagraph ign pos ty subs = do
- let traverseChildren = C.Node pos ty <$> sequence subs
+ handleParagraph
+ :: Maybe Ignore
+ -> ScannerM NodeCPC
+ -> ScannerM NodeCPC
+ handleParagraph ign traverseChildren = do
+ -- If a new paragraph was expected (this stands for True), now we
+ -- don't expect paragraphs any more.
+ ssParagraphExpectedAfterCpcAnnotation .= False
node <- case ign of
- Nothing -> traverseChildren
+ Nothing ->
+ wrapTraverseNodeWithLinkExpectedForCpc traverseChildren
Just (Ignore IMSParagraph _) -> do
ssIgnore .= Nothing
pure defNode
Just (Ignore (IMSLink ignoreLinkState) modePos) ->
- traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs
+ wrapTraverseNodeWithLinkExpected ignoreLinkState modePos $
+ wrapTraverseNodeWithLinkExpectedForCpc traverseChildren
+
+ ssIgnoreCopyPasteCheck .= Nothing
use ssIgnore >>= \case
- Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) ->
+ Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) -> do
lift $ tell $ makeError pragmaPos fp LinkErr
+ ssIgnore .= Nothing
_ -> pass
+ use ssIgnoreCopyPasteCheck >>= \case
+ Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) -> do
+ lift $ tell $ makeError pragmaPos fp LinkErrCpc
+ ssIgnoreCopyPasteCheck .= Nothing
+ _ -> pass
+
pure node
- handleOther ::
- Maybe Ignore ->
- Maybe PosInfo ->
- NodeType ->
- [ScannerM C.Node] ->
- ScannerM C.Node
- handleOther ign pos ty subs = do
- let traverseChildren = C.Node pos ty <$> sequence subs
+ handleOther
+ :: Maybe Ignore
+ -> NodeType
+ -> ScannerM NodeCPC
+ -> ScannerM NodeCPC
+ handleOther ign ty traverseChildren = do
+ -- If right now there was a copy/paste ignore annotation for paragraph,
+ -- emit an error and reset these states.
+ reportExpectedParagraphAfterIgnoreCpcAnnotation ty
case ign of
- Nothing -> traverseChildren
+ Nothing ->
+ wrapTraverseNodeWithLinkExpectedForCpc traverseChildren
Just (Ignore IMSParagraph modePos) -> do
reportExpectedParagraphAfterIgnoreAnnotation modePos ty
ssIgnore .= Nothing
- traverseChildren
- Just (Ignore (IMSLink ignoreLinkState) modePos) -> do
- traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs
+ wrapTraverseNodeWithLinkExpectedForCpc traverseChildren
+ Just (Ignore (IMSLink ignoreLinkState) modePos) ->
+ wrapTraverseNodeWithLinkExpected ignoreLinkState modePos $
+ wrapTraverseNodeWithLinkExpectedForCpc traverseChildren
reportExpectedParagraphAfterIgnoreAnnotation :: Maybe PosInfo -> NodeType -> ScannerM ()
reportExpectedParagraphAfterIgnoreAnnotation modePos ty =
lift . tell . makeError modePos fp . ParagraphErr $ prettyType ty
- traverseNodeWithLinkExpected ::
- IgnoreLinkState ->
- Maybe PosInfo ->
- Maybe PosInfo ->
- NodeType ->
- [ScannerM C.Node] ->
- ScannerM C.Node
- traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs = do
- when (ignoreLinkState == ExpectingLinkInSubnodes) $
- ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink
- node' <- C.Node pos ty <$> sequence subs
- when (ignoreLinkState == ExpectingLinkInSubnodes) $ do
+ resetCpcIgnoreIfLink :: ScannerM ()
+ resetCpcIgnoreIfLink = do
+ curCpcIgnore <- use ssIgnoreCopyPasteCheck
+ case _ignoreMode <$> curCpcIgnore of
+ Just (IMSLink _) -> ssIgnoreCopyPasteCheck .= Nothing
+ _ -> pass
+
+ reportExpectedParagraphAfterIgnoreCpcAnnotation
+ :: NodeType
+ -> ScannerM ()
+ reportExpectedParagraphAfterIgnoreCpcAnnotation ty =
+ use ssIgnoreCopyPasteCheck >>= \case
+ Just (Ignore IMSParagraph modePos) ->
+ whenM (use ssParagraphExpectedAfterCpcAnnotation) $ do
+ lift . tell . makeError modePos fp . ParagraphErrCpc $ prettyType ty
+ ssParagraphExpectedAfterCpcAnnotation .= False
+ ssIgnoreCopyPasteCheck .= Nothing
+ _ -> pass
+
+ wrapTraverseNodeWithLinkExpected
+ :: IgnoreLinkState
+ -> Maybe PosInfo
+ -> ScannerM NodeCPC
+ -> ScannerM NodeCPC
+ wrapTraverseNodeWithLinkExpected ignoreLinkState modePos =
+ if ignoreLinkState /= ExpectingLinkInSubnodes
+ then id
+ else \traverse' -> do
+ ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink
+ node' <- traverse'
currentIgnore <- use ssIgnore
case currentIgnore of
Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do
lift $ tell $ makeError modePos fp LinkErr
ssIgnore .= Nothing
_ -> pass
- return node'
+ return node'
+
+ wrapTraverseNodeWithLinkExpectedForCpc
+ :: ScannerM NodeCPC
+ -> ScannerM NodeCPC
+ wrapTraverseNodeWithLinkExpectedForCpc traverse' = do
+ ignoreCpc <- use ssIgnoreCopyPasteCheck
+ case ignoreCpc of
+ Just (Ignore (IMSLink ExpectingLinkInSubnodes) modePos) -> do
+ ssIgnoreCopyPasteCheck . _Just . ignoreMode .= IMSLink ParentExpectsLink
+ node' <- traverse'
+ currentIgnore <- use ssIgnoreCopyPasteCheck
+ case currentIgnore of
+ Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do
+ lift $ tell $ makeError modePos fp LinkErrCpc
+ ssIgnoreCopyPasteCheck .= Nothing
+ _ -> pass
+ return node'
+ _ -> traverse'
handleAnnotation
:: Maybe PosInfo
-> NodeType
-> Annotation
- -> ScannerM C.Node
+ -> ScannerM NodeCPC
handleAnnotation pos nodeType = \case
IgnoreAnnotation mode -> do
let reportIfThereWasAnnotation :: ScannerM ()
@@ -300,6 +382,41 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process
whenJust mbIgnoreModeState $ \ignoreModeState ->
(ssIgnore .= Just (Ignore ignoreModeState correctPos))
pure defNode
+ IgnoreCopyPasteCheck mode -> do
+ mbIgnoreModeState <- case mode of
+ IMLink -> use ssParentNodeType <&> Just . IMSLink . \case
+ Just PARAGRAPH -> ExpectingLinkInParagraph
+ _ -> ExpectingLinkInSubnodes
+
+ IMParagraph -> do
+ ssParagraphExpectedAfterCpcAnnotation .= True
+ pure $ Just IMSParagraph
+
+ -- We don't expect to find an `ignore all` annotation here,
+ -- since that annotation should be at the top of the file and
+ -- any correct annotations should be handled in `checkGlobalAnnotations`
+ -- function.
+ IMAll -> do
+ lift . tell $ makeError correctPos fp FileErrCpc
+ pure Nothing
+
+ whenJust mbIgnoreModeState $ \ignoreModeState -> do
+ let setupNewCpcState = ssIgnoreCopyPasteCheck .= Just (Ignore ignoreModeState correctPos)
+ use ssIgnoreCopyPasteCheck >>= \case
+ Nothing -> setupNewCpcState
+ Just (Ignore curIgn prevPos)
+ | IMSLink _ <- curIgn -> do
+ lift $ tell $ makeError prevPos fp LinkErrCpc
+ setupNewCpcState
+ | IMSParagraph <- curIgn -> case ignoreModeState of
+ IMSParagraph -> do
+ lift . tell . makeError prevPos fp . ParagraphErrCpc $ prettyType nodeType
+ setupNewCpcState
+ -- It's OK to have link annotation when paragraph is ignored
+ -- because in this case all links and all annotations are ignored.
+ _ -> pass
+ pure defNode
+
InvalidAnnotation msg -> do
lift . tell $ makeError correctPos fp $ UnrecognisedErr msg
pure defNode
@@ -312,8 +429,8 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process
in fromMaybe "" mType
withIgnoreMode
- :: ScannerM C.Node
- -> Writer [ScanError] C.Node
+ :: ScannerM (Node info)
+ -> Writer [ScanError] (Node info)
withIgnoreMode action = action `runStateT` initialScannerState >>= \case
-- We expect `Ignore` state to be `Nothing` when we reach EOF,
-- otherwise that means there was an annotation that didn't match
@@ -328,8 +445,8 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process
(node, _) -> pure node
-- | Custom `foldMap` for source tree.
-foldNode :: (Monoid a, Monad m) => (C.Node -> m a) -> C.Node -> m a
-foldNode action node@(C.Node _ _ subs) = do
+foldNode :: (Monoid a, Monad m) => (Node info -> m a) -> Node info -> m a
+foldNode action node@(Node _ _ _ subs) = do
a <- action node
b <- concatForM subs (foldNode action)
return (a <> b)
@@ -342,16 +459,16 @@ nodeExtractInfo
-> C.Node
-> ExtractorM FileInfo
nodeExtractInfo fp (C.Node nPos nTy nSubs) = do
- let (ignoreFile, contentNodes) = checkGlobalAnnotations nSubs
+ let (ignoreFile, ignoreCpcInFile, contentNodes) = checkGlobalAnnotations nSubs
if ignoreFile
then return def
- else diffToFileInfo <$>
+ else diffToFileInfo (not ignoreCpcInFile) <$>
(lift (processAnnotations fp $ C.Node nPos nTy contentNodes)
>>= foldNode extractor)
where
- extractor :: C.Node -> ExtractorM FileInfoDiff
- extractor node@(C.Node pos ty _) =
+ extractor :: NodeCPC -> ExtractorM FileInfoDiff
+ extractor node@(Node pos ty info _) =
case ty of
HTML_BLOCK _ -> do
return mempty
@@ -401,15 +518,17 @@ nodeExtractInfo fp (C.Node nPos nTy nSubs) = do
t : ts -> (t, Just $ T.intercalate "#" ts)
[] -> error "impossible"
return $ FileInfoDiff
- (DList.singleton $ Reference {rName, rPos, rLink, rAnchor})
+ (DList.singleton $
+ Reference {rName, rPos, rLink, rAnchor, rCheckCopyPaste = cpcShouldCheck info})
DList.empty
-- | Check for global annotations, ignoring simple comments if there are any.
-checkGlobalAnnotations :: [C.Node] -> (Bool, [C.Node])
+checkGlobalAnnotations :: [C.Node] -> (Bool, Bool, [C.Node])
checkGlobalAnnotations nodes = do
let (headerNodes, contentsNodes) = span isHeaderNode nodes
ignoreFile = any isIgnoreFile headerNodes
- (ignoreFile, contentsNodes)
+ ignoreCpcInFile = any isIgnoreCpcWithinFile headerNodes
+ (ignoreFile, ignoreCpcInFile, contentsNodes)
where
isSimpleComment :: C.Node -> Bool
isSimpleComment node = do
@@ -420,15 +539,20 @@ checkGlobalAnnotations nodes = do
isIgnoreFile :: C.Node -> Bool
isIgnoreFile = (Just (IgnoreAnnotation IMAll) ==) . getAnnotation
+ isIgnoreCpcWithinFile :: C.Node -> Bool
+ isIgnoreCpcWithinFile = (Just (IgnoreCopyPasteCheck IMAll) ==) . getAnnotation
+
isHeaderNode :: C.Node -> Bool
isHeaderNode node =
any ($ node)
[ isSimpleComment
, isIgnoreFile
+ , isIgnoreCpcWithinFile
]
-defNode :: C.Node
-defNode = C.Node Nothing DOCUMENT [] -- hard-coded default Node
+-- | Hard-coded default Node
+defNode :: NodeCPC
+defNode = Node Nothing DOCUMENT (CopyPasteCheck False) []
makeError
:: Maybe PosInfo
@@ -473,6 +597,8 @@ textToMode :: Text -> Annotation
textToMode annText = case wordsList of
("ignore" : [x])
| Just ignMode <- getIgnoreMode x -> IgnoreAnnotation ignMode
+ ("no" : "duplication" : "check" : "in" : [x])
+ | Just ignMode <- getIgnoreMode x -> IgnoreCopyPasteCheck ignMode
_ -> InvalidAnnotation annText
where
wordsList = words annText
@@ -482,6 +608,7 @@ getIgnoreMode = \case
"link" -> Just IMLink
"paragraph" -> Just IMParagraph
"all" -> Just IMAll
+ "file" -> Just IMAll
_ -> Nothing
parseFileInfo :: MarkdownConfig -> FilePath -> LT.Text -> (FileInfo, [ScanError])
diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs
index b7daca93..4faddb7e 100644
--- a/src/Xrefcheck/Verify.hs
+++ b/src/Xrefcheck/Verify.hs
@@ -409,7 +409,7 @@ verifyRepo
-- user that we are scanning only files
-- added to Git while gathering RepoInfo.
- toCheckCopyPaste = map (second _fiReferences) filesToScan
+ toCheckCopyPaste = map (second _fiReferences) $ filter (_fiCopyPasteCheck . snd) filesToScan
toScan = concatMap (\(file, fileInfo) -> map (file,) $ _fiReferences fileInfo) filesToScan
copyPasteErrors = if scCopyPasteCheckEnabled cScanners
then [ res
@@ -460,7 +460,8 @@ checkCopyPaste file refs = do
let getLinkAndAnchor x = (rLink x, rAnchor x)
groupedRefs =
L.groupBy ((==) `on` getLinkAndAnchor) $
- sortBy (compare `on` getLinkAndAnchor) refs
+ sortBy (compare `on` getLinkAndAnchor) $
+ filter rCheckCopyPaste refs
concatMap checkGroup groupedRefs
where
checkGroup :: [Reference] -> [CopyPasteCheckResult]
diff --git a/tests/Test/Xrefcheck/CopyPasteCheckSpec.hs b/tests/Test/Xrefcheck/CopyPasteCheckSpec.hs
index fbe09ba6..6db45402 100644
--- a/tests/Test/Xrefcheck/CopyPasteCheckSpec.hs
+++ b/tests/Test/Xrefcheck/CopyPasteCheckSpec.hs
@@ -26,12 +26,14 @@ test_copyPasteCheck = testGroup "Copypaste check"
anchor = Just "heading"
differentAnchor = Nothing
defPos = Position Nothing
- original1 = Reference "_- First - - File" link anchor defPos
- original2 = Reference "_- First - fi - le" link anchor defPos
- notCopied = Reference " Link 2 " link differentAnchor defPos
- copied1 = Reference " foo bar" link anchor defPos
- copied2 = Reference " Baz quux" link anchor defPos
- input = [original1, original2, notCopied, copied1, copied2]
+ original1 = Reference "_- First - - File" link anchor defPos True
+ original2 = Reference "_- First - fi - le" link anchor defPos True
+ notCopied = Reference " Link 2 " link differentAnchor defPos True
+ copied1 = Reference " foo bar" link anchor defPos True
+ copied2 = Reference " Baz quux" link anchor defPos True
+ -- Do not report link with False
+ copied3 = Reference " Qib yse" link anchor defPos False
+ input = [original1, original2, notCopied, copied1, copied2, copied3]
res = checkCopyPaste testPath input
expectedRes =
-- only first matching link is shown in the output
@@ -43,19 +45,30 @@ test_copyPasteCheck = testGroup "Copypaste check"
let link = "./first-file"
anchor = Just "heading"
defPos = Position Nothing
- original1 = Reference "_Foo bar" link anchor defPos
- original2 = Reference " Baz quux" link anchor defPos
- original3 = Reference " Foo qubarx" link anchor defPos
+ original1 = Reference "_Foo bar" link anchor defPos True
+ original2 = Reference " Baz quux" link anchor defPos True
+ original3 = Reference " Foo qubarx" link anchor defPos True
input = [original1, original2, original3]
res = checkCopyPaste testPath input
expectedRes = []
res @?= expectedRes
- , testCase "Check external links" $ do
+ , testCase "Succeed if there is a link with a matching name but with check disabled" $ do
+ let link = "./first-file"
+ anchor = Just "heading"
+ defPos = Position Nothing
+ original1 = Reference "_-_first-fIlE-_-" link anchor defPos False
+ original2 = Reference " Baz quux" link anchor defPos True
+ original3 = Reference " Foo qubarx" link anchor defPos True
+ input = [original1, original2, original3]
+ res = checkCopyPaste testPath input
+ expectedRes = []
+ res @?= expectedRes
+ , testCase "Report external links" $ do
let link = "https://github.com"
anchor = Nothing
defPos = Position Nothing
- original = Reference "github" link anchor defPos
- copied = Reference "gitlab" link anchor defPos
+ original = Reference "github" link anchor defPos True
+ copied = Reference "gitlab" link anchor defPos True
input = [original, copied]
res = checkCopyPaste testPath input
expectedRes =
diff --git a/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs b/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs
index c905bb5c..a2cb14a6 100644
--- a/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs
+++ b/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs
@@ -18,7 +18,8 @@ import Xrefcheck.Scanners.Markdown
test_ignoreAnnotations :: [TestTree]
test_ignoreAnnotations =
- [ testGroup "Parsing failures"
+ [ testGroup "Parsing failures" $
+ [ testGroup "Ignore annotations"
[ testCase "Check if broken link annotation produce error" do
let file = "tests/markdowns/with-annotations/no_link.md"
errs <- getErrs file
@@ -31,35 +32,71 @@ test_ignoreAnnotations =
let file = "tests/markdowns/with-annotations/unexpected_ignore_file.md"
errs <- getErrs file
errs @?= makeError (Just $ PosInfo 9 1 9 29) file FileErr
- , testCase "Check if broken unrecognised annotation produce error" do
+ ]
+ , testGroup "Ignore copypaste check annotations"
+ [ testCase "Check if broken copypaste link annotation produce error" do
+ let file = "tests/markdowns/with-annotations/no_link_cpc.md"
+ errs <- getErrs file
+ errs @?= makeError (Just $ PosInfo 7 1 7 48) file LinkErrCpc
+ , testCase "Check if broken copypaste paragraph annotation produce error" do
+ let file = "tests/markdowns/with-annotations/no_paragraph_cpc.md"
+ errs <- getErrs file
+ errs @?= makeError (Just $ PosInfo 7 1 7 53) file (ParagraphErrCpc "HEADING")
+ , testCase "Check if broken copypaste ignore file annotation produce error" do
+ let file = "tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md"
+ errs <- getErrs file
+ errs @?= makeError (Just $ PosInfo 9 1 9 47) file FileErrCpc
+ ]
+ , testCase "Check if broken unrecognised annotation produce error" do
let file = "tests/markdowns/with-annotations/unrecognised_option.md"
errs <- getErrs file
errs @?= makeError (Just $ PosInfo 7 1 7 46) file (UnrecognisedErr "ignore unrecognised-option")
- ]
- , testGroup "\"ignore link\" mode"
- [ testCase "Check \"ignore link\" performance" $ do
- let file = "tests/markdowns/with-annotations/ignore_link.md"
- (fi, errs) <- parse GitHub file
- getRefs fi @?=
- ["team", "team", "team", "hire-us", "how-we-work", "privacy", "link2", "link2", "link3"]
- errs @?= makeError (Just $ PosInfo 42 1 42 31) file LinkErr
- ]
- , testGroup "\"ignore paragraph\" mode"
- [ testCase "Check \"ignore paragraph\" performance" $ do
- (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_paragraph.md"
- getRefs fi @?= ["blog", "contacts"]
- errs @?= []
- ]
- , testGroup "\"ignore all\" mode"
- [ testCase "Check \"ignore all\" performance" $ do
- (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_file.md"
- getRefs fi @?= []
- errs @?= []
- ]
+ ]
+ , testGroup "Check ignore pragmas" $
+ [ testGroup "\"ignore link\" mode"
+ [ testCase "Check \"ignore link\" performance" $ do
+ let file = "tests/markdowns/with-annotations/ignore_link.md"
+ (fi, errs) <- parse GitHub file
+ getRefs fi @?=
+ ["team", "team", "team", "hire-us", "how-we-work", "privacy", "link2", "link2", "link3"]
+ errs @?= makeError (Just $ PosInfo 42 1 42 31) file LinkErr
+ ]
+ , testGroup "\"ignore paragraph\" mode"
+ [ testCase "Check \"ignore paragraph\" performance" $ do
+ (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_paragraph.md"
+ getRefs fi @?= ["blog", "contacts"]
+ errs @?= []
+ ]
+ , testGroup "\"ignore all\" mode"
+ [ testCase "Check \"ignore all\" performance" $ do
+ (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_file.md"
+ getRefs fi @?= []
+ errs @?= []
+ ]
+ ]
+ , testGroup "Check ignore copypaste check pragmas" $
+ [ testCase "Check ignore duplication check for link pragmas" $ do
+ let file = "tests/markdowns/with-annotations/ignore_link_cpc.md"
+ (fi, errs) <- parse GitHub file
+ getRefsWithCpc fi @?=
+ ["team", "team", "team", "hire-us", "how-we-work", "privacy", "link2", "link2", "link3"]
+ errs @?= makeError (Just $ PosInfo 42 1 42 48) file LinkErrCpc
+ , testCase "Check ignore copypaste check for paragraph pragmas" $ do
+ (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_paragraph_cpc.md"
+ getRefsWithCpc fi @?= ["blog", "contacts"]
+ errs @?= []
+ , testCase "Check ignore copypaste check in file performance" $ do
+ (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_file_cpc.md"
+ fi ^. fiCopyPasteCheck @?= False
+ errs @?= []
+ ]
]
where
getRefs :: FileInfo -> [Text]
getRefs fi = map rName $ fi ^. fiReferences
+ getRefsWithCpc :: FileInfo -> [Text]
+ getRefsWithCpc fi = map rName $ filter rCheckCopyPaste $ fi ^. fiReferences
+
getErrs :: FilePath -> IO [ScanError]
getErrs path = snd <$> parse GitHub path
diff --git a/tests/Test/Xrefcheck/TooManyRequestsSpec.hs b/tests/Test/Xrefcheck/TooManyRequestsSpec.hs
index 8b07d490..a61ae50e 100644
--- a/tests/Test/Xrefcheck/TooManyRequestsSpec.hs
+++ b/tests/Test/Xrefcheck/TooManyRequestsSpec.hs
@@ -63,7 +63,7 @@ test_tooManyRequests = testGroup "429 response tests"
}
}
_ <- verifyReferenceWithProgress
- (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing))
+ (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False)
progressRef
Progress{..} <- vrExternal <$> readIORef progressRef
let ttc = ttTimeToCompletion <$> pTaskTimestamp
@@ -88,7 +88,7 @@ test_tooManyRequests = testGroup "429 response tests"
}
}
_ <- verifyReferenceWithProgress
- (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing))
+ (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False)
progressRef
Progress{..} <- vrExternal <$> readIORef progressRef
let ttc = fromMaybe (sec 0) $ ttTimeToCompletion <$> pTaskTimestamp
@@ -114,7 +114,7 @@ test_tooManyRequests = testGroup "429 response tests"
}
}
_ <- verifyReferenceWithProgress
- (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing))
+ (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False)
progressRef
Progress{..} <- vrExternal <$> readIORef progressRef
let ttc = ttTimeToCompletion <$> pTaskTimestamp
diff --git a/tests/Test/Xrefcheck/UtilRequests.hs b/tests/Test/Xrefcheck/UtilRequests.hs
index 8582cd8c..1285a600 100644
--- a/tests/Test/Xrefcheck/UtilRequests.hs
+++ b/tests/Test/Xrefcheck/UtilRequests.hs
@@ -62,7 +62,7 @@ checkLinkAndProgressWithServer mock link progress vrExpectation =
verifyLink :: Text -> IO (VerifyResult VerifyError, Progress Int)
verifyLink link = do
- let reference = Reference "" link Nothing (Position Nothing)
+ let reference = Reference "" link Nothing (Position Nothing) False
progRef <- newIORef $ initVerifyProgress [reference]
result <- verifyReferenceWithProgress reference progRef
p <- readIORef progRef
diff --git a/tests/golden/check-copy-paste/expected.gold b/tests/golden/check-copy-paste/expected.gold
index 796b4c73..b99f45ea 100644
--- a/tests/golden/check-copy-paste/expected.gold
+++ b/tests/golden/check-copy-paste/expected.gold
@@ -1,48 +1,72 @@
+=== Scan errors found ===
+
+ ➥ In file second-file.md
+ scan error at src:35:1-25:
+
+ Unrecognised option "no dh", perhaps you meant
+ "ignore "
+ or "no duplication check in "?
+
+ ➥ In file second-file.md
+ scan error at src:40:1-53:
+
+ Expected a PARAGRAPH after "no duplication check in paragraph" annotation, but found HEADING
+
+ ➥ In file second-file.md
+ scan error at src:46:1-48:
+
+ Expected a LINK after "no duplication check in link" annotation
+
+ ➥ In file second-file.md
+ scan error at src:51:1-48:
+
+ Annotation "no duplication check in file" must be at the top of markdown or right after comments at the top
+
+Scan errors dumped, 4 in total.
=== Possible copy/paste errors ===
➥ In file second-file.md
- reference (relative) at src:13:1-29:
+ reference (relative) at src:20:1-29:
- text: "Lol Kek"
- link: ./first-file.md
- anchor: -
is possibly a bad copy paste of
- reference (relative) at src:7:1-34:
+ reference (relative) at src:10:1-34:
- text: "First file"
- link: ./first-file.md
- anchor: -
➥ In file second-file.md
- reference (relative) at src:14:1-30:
+ reference (relative) at src:21:1-30:
- text: "Baz quux"
- link: ./first-file.md
- anchor: -
is possibly a bad copy paste of
- reference (relative) at src:7:1-34:
+ reference (relative) at src:10:1-34:
- text: "First file"
- link: ./first-file.md
- anchor: -
➥ In file second-file.md
- reference (relative) at src:24:1-29:
+ reference (relative) at src:31:1-29:
- text: "fdw"
- link: ./first-file.md
- anchor: chor
is possibly a bad copy paste of
- reference (relative) at src:23:1-32:
+ reference (relative) at src:30:1-32:
- text: "ff-cho"
- link: ./first-file.md
- anchor: chor
➥ In file second-file.md
- reference (external) at src:29:1-28:
+ reference (external) at src:70:1-28:
- text: "gitlab"
- link: https://github.com
- anchor: -
is possibly a bad copy paste of
- reference (external) at src:28:1-28:
+ reference (external) at src:69:1-28:
- text: "github"
- link: https://github.com
- anchor: -
Possible copy/paste errors dumped, 4 in total.
-All repository links are valid.
diff --git a/tests/golden/check-copy-paste/first-file.md b/tests/golden/check-copy-paste/first-file.md
index 92b3319f..2b4cf38c 100644
--- a/tests/golden/check-copy-paste/first-file.md
+++ b/tests/golden/check-copy-paste/first-file.md
@@ -4,6 +4,13 @@
- SPDX-License-Identifier: MPL-2.0
-->
+
+
+
+[ Second - ---file- ](./second-file.md)
+[ Link 2](./second-file.md)
+
# heading
# anch
diff --git a/tests/golden/check-copy-paste/second-file.md b/tests/golden/check-copy-paste/second-file.md
index 02f4d554..43297e4a 100644
--- a/tests/golden/check-copy-paste/second-file.md
+++ b/tests/golden/check-copy-paste/second-file.md
@@ -4,8 +4,15 @@
- SPDX-License-Identifier: MPL-2.0
-->
+
+
[ First file ](./first-file.md)
+
+
+[ Link 2](./first-file.md)
+
[ Link 3](./first-file.md#heading)
@@ -24,6 +31,45 @@
[ fdw](./first-file.md#chor)
+
+
+
+
+
+
+
+
+# asd
+
+
+
+
+
+# asd
+
+
+
+
+
+
+
+[ Link 3](./first-file.md)
+
+
+
+
+hello, how are you, bye
+
+
+
+[github](https://github.com)
+[gitlab](https://github.com)
+
[github](https://github.com)
[gitlab](https://github.com)
+
+
+
+[github](https://github.com)
+[gitlab](https://github.com)
diff --git a/tests/golden/check-scan-errors/expected.gold b/tests/golden/check-scan-errors/expected.gold
index a1933ffe..f6e58c5a 100644
--- a/tests/golden/check-scan-errors/expected.gold
+++ b/tests/golden/check-scan-errors/expected.gold
@@ -18,7 +18,9 @@
➥ In file check-scan-errors.md
scan error at src:21:1-50:
- Unrecognised option "ignore unrecognised-annotation" perhaps you meant <"ignore link"|"ignore paragraph"|"ignore all">
+ Unrecognised option "ignore unrecognised-annotation", perhaps you meant
+ "ignore "
+ or "no duplication check in "?
➥ In file check-second-file.md
scan error at src:9:1-29:
diff --git a/tests/markdowns/with-annotations/ignore_file_cpc.md b/tests/markdowns/with-annotations/ignore_file_cpc.md
new file mode 100644
index 00000000..691eadac
--- /dev/null
+++ b/tests/markdowns/with-annotations/ignore_file_cpc.md
@@ -0,0 +1,19 @@
+
+
+
+
+
+
+
+Serokell [web-site](https://serokell.io/)
+Serokell [team](https://serokell.io/team)
+
+Serokell [blog](https://serokell.io/blog)
+
+Serokell [labs](https://serokell.io/labs)
+
+Serokell [contacts](https://serokell.io/contacts)
diff --git a/tests/markdowns/with-annotations/ignore_link_cpc.md b/tests/markdowns/with-annotations/ignore_link_cpc.md
new file mode 100644
index 00000000..b4e6e231
--- /dev/null
+++ b/tests/markdowns/with-annotations/ignore_link_cpc.md
@@ -0,0 +1,49 @@
+
+
+### Do not check the first link in the paragraph
+
+
+Serokell [web-site](https://serokell.io/)
+Serokell [team](https://serokell.io/team)
+
+
+
+Serokell [blog](https://serokell.io/blog)
+
+Serokell [labs](https://serokell.io/labs)
+
+Serokell
+[contacts](https://serokell.io/contacts) and again
+[team](https://serokell.io/team)
+
+### Do not check not the first link in the paragraph
+
+[team](https://serokell.io/team) again and [projects](https://serokell.io/projects)
+
+Also [hire-us](https://serokell.io/hire-us) and
+[fintech](https://serokell.io/fintech-development)
+development
+
+Here are [how-we-work](https://serokell.io/how-we-work) and [privacy](https://serokell.io/privacy)
+and [ml consulting](https://serokell.io/machine-learning-consulting)
+
+
+Do not check link bug _regression test_ [link1](link1) [link2](link2)
+
+
+Another no duplication check in link bug _some [link1](link1) emphasis_ [link2](link2)
+
+### Do not check pragma should be followed by
+
+
+
+This annotation expects link in paragraph right after it.
+
+So [link3](link3) is not checked for copypaste.
+
+Annotation inside paragraph allows
+softbreaks and __other *things*__ in paragraph, so [link4](link4) is checked for copypaste.
diff --git a/tests/markdowns/with-annotations/ignore_paragraph_cpc.md b/tests/markdowns/with-annotations/ignore_paragraph_cpc.md
new file mode 100644
index 00000000..633cc96e
--- /dev/null
+++ b/tests/markdowns/with-annotations/ignore_paragraph_cpc.md
@@ -0,0 +1,16 @@
+
+
+
+Serokell [web-site](https://serokell.io/)
+Serokell [team](https://serokell.io/team)
+
+Serokell [blog](https://serokell.io/blog)
+
+
+Serokell [labs](https://serokell.io/labs)
+
+Serokell [contacts](https://serokell.io/contacts)
diff --git a/tests/markdowns/with-annotations/no_link_cpc.md b/tests/markdowns/with-annotations/no_link_cpc.md
new file mode 100644
index 00000000..e1671e52
--- /dev/null
+++ b/tests/markdowns/with-annotations/no_link_cpc.md
@@ -0,0 +1,8 @@
+
+
+
+not a link
diff --git a/tests/markdowns/with-annotations/no_paragraph_cpc.md b/tests/markdowns/with-annotations/no_paragraph_cpc.md
new file mode 100644
index 00000000..03967d47
--- /dev/null
+++ b/tests/markdowns/with-annotations/no_paragraph_cpc.md
@@ -0,0 +1,9 @@
+
+
+
+
+# not a paragraph
diff --git a/tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md b/tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md
new file mode 100644
index 00000000..70981ebe
--- /dev/null
+++ b/tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md
@@ -0,0 +1,11 @@
+
+
+the first paragraph
+
+
+
+the second paragraph