Skip to content

Commit

Permalink
Docx reader: unwrap content of shaped textboxes...
Browse files Browse the repository at this point in the history
* #9214 text in shape format test document

* #9214 support Text in Shape Format

* #9214 remove irrelevant code
  • Loading branch information
StephanMeijer authored Nov 30, 2023
1 parent eff1790 commit 2e8ecb3
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 0 deletions.
33 changes: 33 additions & 0 deletions src/Text/Pandoc/Readers/Docx/Parse.hs
Original file line number Diff line number Diff line change
Expand Up @@ -148,13 +148,46 @@ mapD f xs =
in
concatMapM handler xs

isAltContentRun :: NameSpaces -> Element -> Bool
isAltContentRun ns element
| isElem ns "w" "r" element
, Just _altContentElem <- findChildByName ns "mc" "AlternateContent" element
= True
| otherwise
= False

-- Elements such as <w:shape> are not always preferred
-- to be unwrapped. Only if they are part of an AlternateContent
-- element, they should be unwrapped.
-- This strategy prevents VML images breaking.
unwrapAlternateContentElement :: NameSpaces -> Element -> [Element]
unwrapAlternateContentElement ns element
| isElem ns "mc" "AlternateContent" element
|| isElem ns "mc" "Fallback" element
|| isElem ns "w" "pict" element
|| isElem ns "v" "group" element
|| isElem ns "v" "rect" element
|| isElem ns "v" "roundrect" element
|| isElem ns "v" "shape" element
|| isElem ns "v" "textbox" element
|| isElem ns "w" "txbxContent" element
= concatMap (unwrapAlternateContentElement ns) (elChildren element)
| otherwise
= unwrapElement ns element

unwrapElement :: NameSpaces -> Element -> [Element]
unwrapElement ns element
| isElem ns "w" "sdt" element
, Just sdtContent <- findChildByName ns "w" "sdtContent" element
= concatMap (unwrapElement ns) (elChildren sdtContent)
| isElem ns "w" "r" element
, Just alternateContentElem <- findChildByName ns "mc" "AlternateContent" element
= unwrapAlternateContentElement ns alternateContentElem
| isElem ns "w" "smartTag" element
= concatMap (unwrapElement ns) (elChildren element)
| isElem ns "w" "p" element
, Just (modified, altContentRuns) <- extractChildren element (isAltContentRun ns)
= (unwrapElement ns modified) ++ concatMap (unwrapElement ns) altContentRuns
| otherwise
= [element{ elContent = concatMap (unwrapContent ns) (elContent element) }]

Expand Down
22 changes: 22 additions & 0 deletions src/Text/Pandoc/Readers/Docx/Util.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ module Text.Pandoc.Readers.Docx.Util (
, findChildrenByName
, findElementByName
, findAttrByName
, extractChildren
) where

import qualified Data.Text as T
import Data.Text (Text)
import Text.Pandoc.XML.Light
import qualified Data.Map as M
import Data.List (partition)

type NameSpaces = M.Map Text Text

Expand Down Expand Up @@ -67,3 +69,23 @@ findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
findAttrByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findAttr (elemName ns' pref name) el


-- | Removes child elements that satisfy a given condition.
-- Returns the modified element and the list of removed children.
extractChildren :: Element -> (Element -> Bool) -> Maybe (Element, [Element])
extractChildren el condition
| null removedChildren = Nothing -- No children removed, return Nothing
| otherwise = Just (modifiedElement, removedChildren) -- Children removed, return Just
where
-- Separate the children based on the condition
(removedChildren, keptChildren) = partition condition (onlyElems' $ elContent el)

-- Helper function to filter only Element types from Content
onlyElems' :: [Content] -> [Element]
onlyElems' = foldr (\c acc -> case c of
Elem e -> e : acc
_ -> acc) []

-- Reconstruct the element with the kept children
modifiedElement = el { elContent = map Elem keptChildren }
4 changes: 4 additions & 0 deletions test/Tests/Readers/Docx.hs
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,10 @@ tests = [ testGroup "document"
"collapse overlapping targets (anchor spans)"
"docx/overlapping_targets.docx"
"docx/overlapping_targets.native"
, testCompare
"text in shape format"
"docx/text_in_shape_format.docx"
"docx/text_in_shape_format.native"
]
, testGroup "blocks"
[ testCompare
Expand Down
Binary file added test/docx/text_in_shape_format.docx
Binary file not shown.
33 changes: 33 additions & 0 deletions test/docx/text_in_shape_format.native
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[ Para
[ Str "Last"
, Space
, Str "update:"
, Space
, Str "May"
, Space
, Str "1,"
, Space
, Str "2017"
]
, Para
[ Str "U"
, Str "sing"
, Space
, Str "Microsoft"
, Space
, Str "Word"
, Space
, Str "2007/2010"
, LineBreak
, Str "for"
, Space
, Str "Writing"
, Space
, Str "Technical"
, Space
, Str "Documents"
]
, Para [ Str "Valter" , Space , Str "Kiisk" ]
, Para [ Str "Institute" , Space , Str "of" , Space , Str "Physics," , Space , Str "University" , Space , Str "of" , Space , Str "Tartu" ]
, Para []
]

0 comments on commit 2e8ecb3

Please sign in to comment.