diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index e83d764fd2e3..4dc9543cfa05 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -148,13 +148,46 @@ mapD f xs = in concatMapM handler xs +isAltContentRun :: NameSpaces -> Element -> Bool +isAltContentRun ns element + | isElem ns "w" "r" element + , Just _altContentElem <- findChildByName ns "mc" "AlternateContent" element + = True + | otherwise + = False + +-- Elements such as are not always preferred +-- to be unwrapped. Only if they are part of an AlternateContent +-- element, they should be unwrapped. +-- This strategy prevents VML images breaking. +unwrapAlternateContentElement :: NameSpaces -> Element -> [Element] +unwrapAlternateContentElement ns element + | isElem ns "mc" "AlternateContent" element + || isElem ns "mc" "Fallback" element + || isElem ns "w" "pict" element + || isElem ns "v" "group" element + || isElem ns "v" "rect" element + || isElem ns "v" "roundrect" element + || isElem ns "v" "shape" element + || isElem ns "v" "textbox" element + || isElem ns "w" "txbxContent" element + = concatMap (unwrapAlternateContentElement ns) (elChildren element) + | otherwise + = unwrapElement ns element + unwrapElement :: NameSpaces -> Element -> [Element] unwrapElement ns element | isElem ns "w" "sdt" element , Just sdtContent <- findChildByName ns "w" "sdtContent" element = concatMap (unwrapElement ns) (elChildren sdtContent) + | isElem ns "w" "r" element + , Just alternateContentElem <- findChildByName ns "mc" "AlternateContent" element + = unwrapAlternateContentElement ns alternateContentElem | isElem ns "w" "smartTag" element = concatMap (unwrapElement ns) (elChildren element) + | isElem ns "w" "p" element + , Just (modified, altContentRuns) <- extractChildren element (isAltContentRun ns) + = (unwrapElement ns modified) ++ concatMap (unwrapElement ns) altContentRuns | otherwise = [element{ elContent = concatMap (unwrapContent ns) (elContent element) }] diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index 0a869bba8691..f373c53304ee 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -21,12 +21,14 @@ module Text.Pandoc.Readers.Docx.Util ( , findChildrenByName , findElementByName , findAttrByName + , extractChildren ) where import qualified Data.Text as T import Data.Text (Text) import Text.Pandoc.XML.Light import qualified Data.Map as M +import Data.List (partition) type NameSpaces = M.Map Text Text @@ -67,3 +69,23 @@ findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text findAttrByName ns pref name el = let ns' = ns <> elemToNameSpaces el in findAttr (elemName ns' pref name) el + + +-- | Removes child elements that satisfy a given condition. +-- Returns the modified element and the list of removed children. +extractChildren :: Element -> (Element -> Bool) -> Maybe (Element, [Element]) +extractChildren el condition + | null removedChildren = Nothing -- No children removed, return Nothing + | otherwise = Just (modifiedElement, removedChildren) -- Children removed, return Just + where + -- Separate the children based on the condition + (removedChildren, keptChildren) = partition condition (onlyElems' $ elContent el) + + -- Helper function to filter only Element types from Content + onlyElems' :: [Content] -> [Element] + onlyElems' = foldr (\c acc -> case c of + Elem e -> e : acc + _ -> acc) [] + + -- Reconstruct the element with the kept children + modifiedElement = el { elContent = map Elem keptChildren } \ No newline at end of file diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs index d9935967f6c2..05b69abf618c 100644 --- a/test/Tests/Readers/Docx.hs +++ b/test/Tests/Readers/Docx.hs @@ -235,6 +235,10 @@ tests = [ testGroup "document" "collapse overlapping targets (anchor spans)" "docx/overlapping_targets.docx" "docx/overlapping_targets.native" + , testCompare + "text in shape format" + "docx/text_in_shape_format.docx" + "docx/text_in_shape_format.native" ] , testGroup "blocks" [ testCompare diff --git a/test/docx/text_in_shape_format.docx b/test/docx/text_in_shape_format.docx new file mode 100644 index 000000000000..763e62bf62c9 Binary files /dev/null and b/test/docx/text_in_shape_format.docx differ diff --git a/test/docx/text_in_shape_format.native b/test/docx/text_in_shape_format.native new file mode 100644 index 000000000000..09cfa932c61f --- /dev/null +++ b/test/docx/text_in_shape_format.native @@ -0,0 +1,33 @@ +[ Para + [ Str "Last" + , Space + , Str "update:" + , Space + , Str "May" + , Space + , Str "1," + , Space + , Str "2017" + ] +, Para + [ Str "U" + , Str "sing" + , Space + , Str "Microsoft" + , Space + , Str "Word" + , Space + , Str "2007/2010" + , LineBreak + , Str "for" + , Space + , Str "Writing" + , Space + , Str "Technical" + , Space + , Str "Documents" + ] +, Para [ Str "Valter" , Space , Str "Kiisk" ] +, Para [ Str "Institute" , Space , Str "of" , Space , Str "Physics," , Space , Str "University" , Space , Str "of" , Space , Str "Tartu" ] +, Para [] +] \ No newline at end of file