From e46a9d8b740786095d915adfd1973d058b927139 Mon Sep 17 00:00:00 2001 From: Sten Larsson Date: Wed, 9 Oct 2024 14:26:31 +0200 Subject: [PATCH 1/2] Parse XLSX files with unusual structure Normally you will find `workbook.xml` in the `xl` directory, but recently we got a file where it was located in the root. To find the actual location of workbook, we must parse the file `_rels/.rels`. A separate rels file located relative to the workbook then points out the locations of the sheets, styles and shared strings. --- lib/creek/book.rb | 34 ++++++++++++++++++++++++------ lib/creek/shared_strings.rb | 4 ++-- lib/creek/styles.rb | 4 ++-- spec/fixtures/test_structure.xlsx | Bin 0 -> 5238 bytes spec/test_spec.rb | 18 ++++++++++++++++ 5 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 spec/fixtures/test_structure.xlsx diff --git a/lib/creek/book.rb b/lib/creek/book.rb index fae6090..1be15be 100644 --- a/lib/creek/book.rb +++ b/lib/creek/book.rb @@ -8,7 +8,9 @@ class Creek::Book attr_reader :files, :sheets, :shared_strings, - :with_headers + :with_headers, + :workbook_rels_by_type, + :workbook_rels_by_id DATE_1900 = Date.new(1899, 12, 30).freeze DATE_1904 = Date.new(1904, 1, 1).freeze @@ -21,12 +23,14 @@ def initialize path, options = {} end path = download_file(path) if options[:remote] @files = Zip::File.open(path) + parse_workbook_path + parse_workbook_rels @shared_strings = SharedStrings.new(self) @with_headers = options.fetch(:with_headers, false) end def sheets - doc = @files.file.open "xl/workbook.xml" + doc = @files.file.open @workbook_path xml = Nokogiri::XML::Document.parse doc namespaces = xml.namespaces @@ -37,10 +41,8 @@ def sheets end end - rels_doc = @files.file.open "xl/_rels/workbook.xml.rels" - rels = Nokogiri::XML::Document.parse(rels_doc).css("Relationship") @sheets = xml.css(cssPrefix+'sheet').map do |sheet| - sheetfile = rels.find { |el| sheet.attr("r:id") == el.attr("Id") }.attr("Target") + sheetfile = @workbook_rels_by_id[sheet.attr("r:id")] sheet = Sheet.new( self, sheet.attr("name"), @@ -71,7 +73,7 @@ def base_date # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx result = DATE_1900 # default - doc = @files.file.open "xl/workbook.xml" + doc = @files.file.open @workbook_path xml = Nokogiri::XML::Document.parse doc xml.css('workbookPr[date1904]').each do |workbookPr| if workbookPr['date1904'] =~ /true|1/i @@ -98,5 +100,25 @@ def download_file(url) downloaded.path end end + + def parse_workbook_path + rels_file = @files.file.open '_rels/.rels' + rels_xml = Nokogiri::XML::Document.parse(rels_file).css('Relationship') + rel = rels_xml.find { |el| el.attr('Type') == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" } + @workbook_path = rel.attr('Target') + end + + def parse_workbook_rels + @workbook_rels_by_id = {} + @workbook_rels_by_type = {} + workbook_dirname, slash, workbook_basename = @workbook_path.rpartition('/') + workbook_rels_file = @files.file.open "#{workbook_dirname}#{slash}_rels/#{workbook_basename}.rels" + Nokogiri::XML::Document.parse(workbook_rels_file).css('Relationship').each do |rel| + target = rel.attr('Target') + target = "#{workbook_dirname}#{slash}#{target}" unless target.start_with?('/') + @workbook_rels_by_id[rel.attr('Id')] = target + @workbook_rels_by_type[rel.attr('Type')] = target + end + end end end diff --git a/lib/creek/shared_strings.rb b/lib/creek/shared_strings.rb index dd5c922..561d426 100644 --- a/lib/creek/shared_strings.rb +++ b/lib/creek/shared_strings.rb @@ -13,8 +13,8 @@ def initialize book end def parse_shared_shared_strings - path = "xl/sharedStrings.xml" - if @book.files.file.exist?(path) + path = @book.workbook_rels_by_type["http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings"] + if path doc = @book.files.file.open path xml = Nokogiri::XML::Document.parse doc parse_shared_string_from_document(xml) diff --git a/lib/creek/styles.rb b/lib/creek/styles.rb index d4681e9..ffbba36 100644 --- a/lib/creek/styles.rb +++ b/lib/creek/styles.rb @@ -6,12 +6,12 @@ def initialize(book) end def path - "xl/styles.xml" + @book.workbook_rels_by_type["http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"] end def styles_xml @styles_xml ||= begin - if @book.files.file.exist?(path) + if path doc = @book.files.file.open path Nokogiri::XML::Document.parse doc end diff --git a/spec/fixtures/test_structure.xlsx b/spec/fixtures/test_structure.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..9b21b95575415d8f7a24fbac80bd61f3ab937ed2 GIT binary patch literal 5238 zcmb7Ic{tQv8y@S}4cQ7=5@jblk+Ef`j5Wg8nK2lu;b>gd=V24efhtpvusle{bMt?hJ`jj= zxgfSmYhMo+(VEgaXiib7a#(35=a9q>T?j8C8y{&nwHthq>0->NTb3ya?IuUp``qln zve3-d3y<8(wV!{M7bR7OyeYlcZrxOZ`+$DH*R)gr&M{i4;2k$#Vo0{f5JiTe)zjJ4(bC!3@!$B2yD0=m36d%8WP)GKJbnkl4`1aq z@(C&@fDTuIVnh@R?44Pp!V}`N@{d&gKSFwPB$}MnPK7#-!AKFEWyl}_(J9v>4?xk$ zMwD)v!|vHI=wIU!2GeIvpRya+Qk9C5Nop7DOiTxDNDvi7-b zt|?XRP~o8xbqbHHF3(1!XbsJ14MF^f3@yN{Dxt14WSOASBbm@o<7nNqo z(hX6!uGHS2XqPfSb79K%Zu7$yxiPmc@#Fw^$mf%d5C5tX6Q}&a4a^yAgy_&TbHEl5 z0{}pb;f7!F>h-HgXIJYlg_bRPXfuf!zZBYPn!GwnkS6eY(~@xbvyhH>oC>^U8l{%o zQb&Qb7d7tiF&mdYF_h+sm4CV)ad7tY=}9zg$s|R#*yat;LxZ7w;;#Q&|U6HOE&bd>IP?CCwXqW-AmSW;2w|osQg0zi$$d>d&D5C^btaSM~Lau%bSfY$!^{r z%DU(BeA5o;gIS4!cL^5>XRfUZ5(OJ7L~qI8o596-J_S{s`z5=`j=0Dn1vjlO*R{YQvN)^%5FCpUpF znQF(#0e1Y7sUOM4>PvelKkpOF`bJE`>A$c4`8e=n5x;B99{_7_HGe%5eqoPV3zOK0 zNcxSMkbKp^Pz^bd5w+stTw6sn0A`GQJMBA@3)zcvt!*udZ7x#4j+ac((mjXXGDqb zH;Ut)%25Ujf(X{9wOtxbNhrD*NMd42EW~S>0&q9e>mvPprMX=YEiF6#y=b)KsX5ib z9VZSO)#yvgZqph1^GxZ7#O*+92hnP_C?0R#m}v^uw&(Z3nS15+r*ok#o&5N|bREm_ zoahsER+dAR8G4aLj$sfV9lj`q^wVLVL@Fa!2*Vv}H!1ryhaHh;yq zaPY5$12=9dQq38wBl|BZ=q}7Jcg>R@-?io7enK^7$NoS7fC95?H>9_dwVR!_HS*u& zaTU@F?+^sF?(p!=#^AxI%82eiW8Wf+Fe7)N>#gFOtXWu-)*^B__W-oy`oOQ!)_W~3 z>!h{JG@7YZ2qt_UJheu^AXJs0%b5srhWTCWcx=ec-dAAG#-G+c+&e-GhC)dPQ_e%_ zaH0g+w8|iqp=p|Zt`0s|gm4lJmokk7-NCU#vC7>+u3Cihf*g`h@UsDII^98z!cB7x@+i!ucNJ$) znJb;{uzebhul$j>L$ff1)2+8aS;{Cvf+*Ye=|)$j-nyZ`ma|#;?6B{t#*qR8*jA63 zI}^N%8b?f34=_17r3&5So=T@=e)-V&6qv*9AP3pOifUGpQkyJEshIk=Y-Y@;3< zna~E-Z0C7ujzq8a;T!bx!>dp#?kgeb%qwfQZIU)?EqYrG{&xC4pB?v0(&0B9_tlz4 z8g02e#O9PnD}aMfCelROp7a8Ls0r#9HNnGV-plE$luTn}1UvpwO0f5aDRrI1BtGWe zVEuZhVix~zSBH^_9TS8>c2;#_KxBB4gOf80K2iIi?Bz43L@`ark_;08RVesr`@k$!|i=*_m}-khWDnl@a_( z!S0QTO`BWJ#+u)(WTp|A$avW2A{!(W6oo$!-3wuhTE2cam~@X|CI^(NK^+nxlTAX! zt{4eo1D7hYvfAIdA%~fW3m$B-aZB)f@ta5ba5o@ySvV#Bo_gpZ?wP9!o?xP7qL-ns zG^}BzU;%lCOWX@2jwgvfD!Qc*+HmjHq%CB>%C|W?oU(Y5F9j{`arm%-dv(87dM1L;lTlEsaxsF#|bb`jmkC!yA4V-q#H z(Z)uDTgAo|#b@4y?wL<)p0*TCyvWXijokpy9>=zlw0g~NaUgy%~cut4UroMAvo#CJTxs}v+>G#Z$<7?=|od}S{a^JZOUaGtg$dUTymwwr60 zCTEW4TBqmk>9%?8gD?WM2jI~m_nJg~-DLP(X**_EAzw|>_OluDB@yS+DjtR03DMZI zbmTtom%5{wWRMM4C9CJ-wpN=SsnLoXDBlPWU2|^a?Yh+L-N-K{ueE2ed$#BxQ#j?m zM>Zo|A>VlX_Ri}oITRn`yNP=}&+(8LJL}h7o)UI~+?p<&;&=J0azwsPtXg4y+e#!A zvUDa+Se%WAsJb>`y1El&waxv>3a+BhU{#Xbzc(kN7sad%l=%c zk8$<$7|1dZjFxs_CLrQgNi*p*Lx-SpWA}|P78HkuM!Q0K$pOCBJe5pz&$>kRE46@d z$7zFoQZFijIqOW+vKOB7{Kb}=8@nJDm(_qE?}Nac5B=0io&tAv3(z!K@3hBLpe$Ue zH~U8^NpOolK0RXYZ#$x-V()Y(qp91euTMl9NF0?iqel<-C>ACJa4X+ZqG)&6S*MOr zhjLR#y>dU)b>|m##ontJfy5fEFE{Joo(+sK6WH;OLW{+507Sy7F|Gt_(6OHFIL>j+ z#e)68p{%A!Q(!J?McX)1brJsa(+B|ET&b710YG4Icjr0V1XVT76`ixe-icfK(E;{M z+JLe3hU^IFea7hcc`j>Bg7?l=6TIo4!>O zudlj(_e|DX4f>{`Cr=dm8DwL57*9kCg&E#kHmKn3oJAned!OViM2|?>{25|iyECm* z;P+Ehcq}i4Z>e8e&j~AnJKf`%o@yc@s-pd3d%w2Dn+UMci#p43VvI!*#sdK8extz3 z0%`Hxs>FyN+C*c=KPcXfV&b-x`#*4`}tx#F34APFRgjH0$ z=#3baV5L?+am~q`HC-&T2Y#vKb)WXzp(pLYlJi}N1s9V0p9reYq=KhhOX5OA;)QBA zeKmce9uR%*1n-~lqkpipXla0#M;OZ(hq5Jk#(ly>306T&TcrO%a^roh?_<~Q+zbMm zS3OKl3Bm8omT;Tf9`)gVLI#;0p}4bOWEtHK;O!eQKc8P3TDoA`YaKt6%(Qqnw}lD$ zmvKyD!wPiEMpDTo`p8VBE1<6R5LZVX2lq7IKfO3$N(}SEFb${)!hZen{ej@?`@=5` z{|bD(=l&Vk0E7Q?%K_2*cf{X2)vt)yAoGLVFwZ4F9Rv&eS-r5R^?eS zw&q)Bg~9qm6ZGq5kM~tSZ`S-f*3Vo1yQlc~ePf%p<88^$;Qj>P0{y=Lk30UK0g;4& z-~9i73-T+QzkA7_*@P1TzB)?`&>vmpkAnQ&SAGM+X1U}m(EpX_--Y-aA~w+jzf$2oQGmA2i4vL;wH) literal 0 HcmV?d00001 diff --git a/spec/test_spec.rb b/spec/test_spec.rb index dabb932..1dc1619 100644 --- a/spec/test_spec.rb +++ b/spec/test_spec.rb @@ -166,3 +166,21 @@ expect(rows.map{|r| r['cells']}).to eq(@expected_rows) end end + +describe 'Creek parsing a file with unusual structure.' do + before(:all) do + @creek = Creek::Book.new 'spec/fixtures/test_structure.xlsx' + @expected_rows = [ + {"A1"=>"date", "B1"=>"dimension", "C1"=>"metric_1", "D1"=>"metric_2", "E1"=>"metric_3"}, + {"A2"=>"2022-02-27", "B2"=>"A", "C2"=>"1", "D2"=>"5.30", "E2"=>"11:11:11"} + ] + end + + after(:all) do + @creek.close + end + + it 'Parse rows successfully.' do + expect(@creek.sheets[0].rows.to_a).to eq(@expected_rows) + end +end From a63f0a5d6911b6ffaccc3bddce678cc577689e19 Mon Sep 17 00:00:00 2001 From: Sten Larsson Date: Wed, 27 Nov 2024 09:27:18 +0100 Subject: [PATCH 2/2] Handle files missing _rels/.rels We found an XLSX missing the `_rels/.rels` file, and in this case falling back to the default path worked. --- lib/creek/book.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/creek/book.rb b/lib/creek/book.rb index 1be15be..e1a45be 100644 --- a/lib/creek/book.rb +++ b/lib/creek/book.rb @@ -106,6 +106,8 @@ def parse_workbook_path rels_xml = Nokogiri::XML::Document.parse(rels_file).css('Relationship') rel = rels_xml.find { |el| el.attr('Type') == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" } @workbook_path = rel.attr('Target') + rescue Errno::ENOENT + @workbook_path = 'xl/workbook.xml' end def parse_workbook_rels