-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #28 from athityakumar/xlsx-importer
XLSX Importer
- Loading branch information
Showing 10 changed files with 243 additions and 14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
require 'daru/io/importers/base' | ||
|
||
module Daru
  module IO
    module Importers
      # Imports a +Daru::DataFrame+ from a given XLSX file and sheet.
      # The workbook is read through +Roo::Excelx+.
      class Excelx < Base
        # Stores the import options. The workbook itself is only opened
        # when {#call} is invoked.
        #
        # @param path [String] Local / Remote path to XLSX file
        # @param sheet [String, Integer] Sheet name (or sheet position) in the
        #   given XLSX file. Defaults to 0, to parse the dataframe from the
        #   first sheet.
        # @param skiprows [Integer] Skips the first +:skiprows+ number of rows from the
        #   sheet being parsed.
        # @param skipcols [Integer] Skips the first +:skipcols+ number of columns from the
        #   sheet being parsed.
        # @param order [Boolean] Defaults to true. When set to true, first row of the
        #   given sheet is used as the order of the Daru::DataFrame and data of
        #   the Dataframe consists of the remaining rows.
        #
        #   When set to false, a default order (0 to n-1) is chosen for the DataFrame,
        #   and the data of the DataFrame consists of all rows in the sheet.
        # @param index [Boolean] Defaults to false. When set to true, first column of the
        #   given sheet is used as the index of the Daru::DataFrame and data of
        #   the Dataframe consists of the remaining columns.
        #
        # @example Importing from a local file
        #   path  = 'spec/fixtures/excelx/Stock-counts-sheet.xlsx'
        #   sheet = 'Example Stock Counts'
        #   df    = Daru::IO::Importers::Excelx.new(path, sheet: sheet).call
        #   df
        #
        #   #=> <Daru::DataFrame(15x7)>
        #   #           Status Stock coun  Item code        New Descriptio Stock coun Offset G/L
        #   #         0          H          1        nil        nil New stock  2014-08-01        nil
        #   #         1        nil          1  IND300654          2 New stock  2014-08-01      51035
        #   #         2        nil          1   IND43201          5 New stock  2014-08-01      51035
        #   #         3        nil          1   OUT30045          3 New stock  2014-08-01      51035
        #   #        ...       ...        ...     ...           ...     ...        ...           ...
        #
        # @example Importing from a local file without headers
        #   path  = 'spec/fixtures/excelx/Stock-counts-sheet.xlsx'
        #   sheet = 'Example Stock Counts'
        #   df    = Daru::IO::Importers::Excelx.new(path, sheet: sheet, order: false).call
        #   df
        #
        #   #=> <Daru::DataFrame(16x7)>
        #   #                0          1          2          3          4          5          6
        #   #     0     Status Stock coun  Item code        New Descriptio Stock coun Offset G/L
        #   #     1          H          1        nil        nil New stock  2014-08-01        nil
        #   #     2        nil          1  IND300654          2 New stock  2014-08-01      51035
        #   #     3        nil          1   IND43201          5 New stock  2014-08-01      51035
        #   #     4        nil          1   OUT30045          3 New stock  2014-08-01      51035
        #   #    ...       ...        ...     ...           ...     ...        ...           ...
        #
        # @example Importing from a remote URL
        #   path  = 'https://www.exact.com/uk/images/downloads/getting-started-excel-sheets/Stock-counts-sheet.xlsx'
        #   sheet = 'Example Stock Counts'
        #   df    = Daru::IO::Importers::Excelx.new(path, sheet: sheet).call
        #   df
        #
        #   #=> <Daru::DataFrame(15x7)>
        #   #           Status Stock coun  Item code        New Descriptio Stock coun Offset G/L
        #   #         0          H          1        nil        nil New stock  2014-08-01        nil
        #   #         1        nil          1  IND300654          2 New stock  2014-08-01      51035
        #   #         2        nil          1   IND43201          5 New stock  2014-08-01      51035
        #   #         3        nil          1   OUT30045          3 New stock  2014-08-01      51035
        #   #        ...       ...        ...     ...           ...     ...        ...           ...
        def initialize(path, sheet: 0, order: true, index: false, skiprows: 0, skipcols: 0)
          # NOTE(review): optional_gem is inherited from Base; presumably it
          # loads / version-checks the roo gem - confirm against Base.
          optional_gem 'roo', '~> 2.7.0'

          @path     = path
          @sheet    = sheet
          @order    = order
          @index    = index
          @skiprows = skiprows
          @skipcols = skipcols
        end

        # Reads the configured sheet and converts it into a DataFrame.
        #
        # The +@order+ / +@index+ flags are never overwritten here. Replacing
        # them with the computed labels made a disabled +order+ look enabled
        # (the default Range is truthy), which silently dropped the first data
        # row, and made a second invocation behave differently from the first.
        #
        # @return [Daru::DataFrame] imported from the given XLSX file and sheet
        def call
          book      = Roo::Excelx.new(@path)
          worksheet = book.sheet(@sheet)
          rows      = strip_html_tags(skip_data(worksheet.to_a, @skiprows, @skipcols))

          index = process_index(rows)
          order = process_order(rows)
          data  = process_data(rows)
          # The default order is sized from the trimmed data so that it does
          # not count the column that was consumed as the index.
          order ||= (0..data.first.length - 1)

          Daru::DataFrame.rows(data, order: order, index: index)
        end

        private

        # Removes the header row and / or the index column from the cell data,
        # depending on which of the +order+ / +index+ options are enabled.
        #
        # @param rows [Array<Array>] cell values after skiprows / skipcols
        # @return [Array<Array>] only the cells that belong to the DataFrame body
        def process_data(rows)
          skip_data(rows, @order ? 1 : 0, @index ? 1 : 0)
        end

        # Extracts the index labels (first column) when +index+ is enabled.
        #
        # @param rows [Array<Array>] cell values after skiprows / skipcols
        # @return [Array, nil] index labels, or nil when +index+ is disabled
        def process_index(rows)
          return nil unless @index

          labels = rows.transpose.first
          # The top-left cell belongs to the header row, not to the index.
          @order ? skip_data(labels, 1) : labels
        end

        # Extracts the column labels (first row) when +order+ is enabled.
        #
        # @param rows [Array<Array>] cell values after skiprows / skipcols
        # @return [Array, nil] column labels, or nil when +order+ is disabled
        def process_order(rows)
          return nil unless @order

          header = rows.first
          # The top-left cell sits above the index column, so it is not a label.
          @index ? skip_data(header, 1) : header
        end

        # Drops leading elements from an Array, and optionally leading
        # elements from each of its rows.
        #
        # @param data [Array] a flat Array, or an Array of row Arrays
        # @param rows [Integer] number of leading elements of +data+ to drop
        # @param cols [Integer, nil] number of leading elements to drop from
        #   every remaining row; nil leaves the rows untouched
        # @return [Array] the remaining elements
        def skip_data(data, rows, cols=nil)
          remaining = data[rows..-1]
          return remaining if cols.nil?

          remaining.map { |row| row[cols..-1] }
        end

        # Removes anything of the form +<...>+ from every String cell; cells
        # of any other type are passed through unchanged.
        #
        # @param data [Array<Array>] cell values
        # @return [Array<Array>] cell values with tags stripped from Strings
        def strip_html_tags(data)
          data.map do |row|
            row.map do |ele|
              next ele unless ele.is_a?(String)

              ele.gsub(/<[^>]+>/, '')
            end
          end
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
# Specs for the XLSX importer: parsing by default sheet, by sheet name and by
# sheet position, skipping rows / columns, and parity between local files and
# the same files served from a (stubbed) remote URL.
RSpec.describe Daru::IO::Importers::Excelx do
  # opts is splatted into keywords: the initializer only accepts keyword
  # options, and Ruby 3+ no longer converts a positional Hash into keywords.
  subject { described_class.new(path, **opts).call }

  let(:opts) { {} }

  # Fixtures that are also served through the stubbed remote URL.
  remote_fixtures = %w[LOBSTAHS_rt.windows Microcode Stock-counts-sheet].freeze

  before do
    remote_fixtures.each do |file|
      WebMock
        .stub_request(:get, "http://dummy-remote-url/#{file}.xlsx")
        # xlsx files are binary, so they are read without text-mode translation.
        .to_return(status: 200, body: File.binread("spec/fixtures/excelx/#{file}.xlsx"))
    end
    WebMock.disable_net_connect!(allow: /dummy-remote-url/)
  end

  context 'when sheet is not specified' do
    let(:path) { 'spec/fixtures/excelx/Microcode.xlsx' }

    it_behaves_like 'exact daru dataframe',
      ncols: 32,
      nrows: 37,
      index: (0..36).to_a,
      :'State.first' => 'FETCH0'
  end

  context 'when sheet name is given' do
    let(:path) { 'spec/fixtures/excelx/LOBSTAHS_rt.windows.xlsx' }
    let(:opts) { {sheet: 'LOBSTAHS_rt.windows'} }

    it_behaves_like 'exact daru dataframe',
      ncols: 3,
      nrows: 93,
      order: %w[lipid_class rt_win_max rt_win_min],
      index: (0..92).to_a,
      :'lipid_class.first' => 'DGCC'
  end

  context 'when sheet contains nil elements' do
    let(:path) { 'spec/fixtures/excelx/Stock-counts-sheet.xlsx' }
    let(:opts) { {sheet: 2} }

    it_behaves_like 'exact daru dataframe',
      ncols: 7,
      nrows: 15,
      order: [
        'Status','Stock count number','Item code','New','Description',
        'Stock count date','Offset G/L Inventory'
      ],
      index: (0..14).to_a,
      :'Item code.first' => nil,
      :'Stock count number.first' => 1
  end

  context 'when skipping rows and columns' do
    let(:path) { 'spec/fixtures/excelx/pivot.xlsx' }
    let(:opts) { {sheet: 'Data1', skiprows: 2, skipcols: 1} }

    it_behaves_like 'exact daru dataframe',
      ncols: 9,
      nrows: 2155,
      index: (0..2154).to_a,
      :'Unit Price.first' => 14
  end

  context 'checks for equal parsing of local XLSX files and remote XLSX files' do
    remote_fixtures.each do |file|
      # Each fixture gets its own example group. Redefining `let` repeatedly
      # inside a single group keeps only the last definition, so every
      # example would otherwise run against the last fixture.
      context "with #{file}.xlsx" do
        let(:local) { described_class.new("spec/fixtures/excelx/#{file}.xlsx").call }
        let(:path)  { "http://dummy-remote-url/#{file}.xlsx" }

        it { is_expected.to be_an(Daru::DataFrame) }
        it { is_expected.to eq(local) }
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.