diff --git a/Gemfile b/Gemfile index 1062328..2eb7462 100644 --- a/Gemfile +++ b/Gemfile @@ -9,6 +9,7 @@ group :optional do gem 'mechanize' gem 'mongo' gem 'redis' + gem 'roo', '~> 2.7.0' gem 'spreadsheet', '~> 1.1.1' gem 'sqlite3' end diff --git a/lib/daru/io/importers/excel.rb b/lib/daru/io/importers/excel.rb index caaff31..829c452 100644 --- a/lib/daru/io/importers/excel.rb +++ b/lib/daru/io/importers/excel.rb @@ -4,23 +4,32 @@ module Daru module IO module Importers class Excel < Base - Daru::DataFrame.register_io_module :from_excel, self + Daru::DataFrame.register_io_module :from_excel do |*args| + if args.first.end_with? '.xlsx' + require 'daru/io/importers/excelx' + Daru::IO::Importers::Excelx.new(*args).call + else + Daru::IO::Importers::Excel.new(*args).call + end + end - # Imports a +Daru::DataFrame+ from an Excel file. + # Imports a +Daru::DataFrame+ from an Excel file (.xls, or .xlsx formats) # - # @param path [String] Path of Excel file, where the - # DataFrame is to be imported from. - # @param worksheet_id [Interger] The index of the worksheet in the excel file, + # @param path [String] Path of Excel file, where the DataFrame is to be imported from. + # @param worksheet_id [Integer] The index of the worksheet in the excel file, # from where the +Daru::DataFrame+ will be imported. By default, the first - # worksheet has +worksheet_id+ as 0. In general, the n-th worksheet has + # worksheet has +:worksheet_id+ as 0. In general, the n-th worksheet has # its worksheet_id as n-1. # # If worksheet_id option is not given, it is taken as 0 by default and the # +Daru::DataFrame+ will be imported from the first worksheet in the excel file. + # @param headers [Boolean] Defaults to true. When set to true, first row of the + # given worksheet_id is used as the order of the Daru::DataFrame and data of + # the Dataframe consists of the remaining rows. 
# # @return A +Daru::DataFrame+ imported from the given excel worksheet # - # @example Reading from a default worksheet of an Excel file + # @example Reading from the default worksheet of an Excel file # df = Daru::IO::Importers::Excel.new("test_xls.xls").call # df # @@ -45,22 +54,26 @@ class Excel < Base # # 3 4 Franz nil Paris nil # # 4 5 George 5.5 Tome a,b,c # # 5 6 Fernand nil nil nil - def initialize(path, worksheet_id: 0) + def initialize(path, worksheet_id: 0, headers: true) optional_gem 'spreadsheet', '~> 1.1.1' @path = path + @headers = headers @worksheet_id = worksheet_id end def call - book = Spreadsheet.open @path - worksheet = book.worksheet @worksheet_id - headers = ArrayHelper.recode_repeated(worksheet.row(0)).map(&:to_sym) + worksheet = Spreadsheet.open(@path).worksheet(@worksheet_id) + headers = if @headers + ArrayHelper.recode_repeated(worksheet.row(0)).map(&:to_sym) + else + (0..worksheet.row(0).to_a.size-1).to_a + end df = Daru::DataFrame.new({}) headers.each_with_index do |h,i| col = worksheet.column(i).to_a - col.delete_at 0 + col.delete_at(0) if @headers df[h] = col end diff --git a/lib/daru/io/importers/excelx.rb b/lib/daru/io/importers/excelx.rb new file mode 100644 index 0000000..e259b3d --- /dev/null +++ b/lib/daru/io/importers/excelx.rb @@ -0,0 +1,132 @@ +require 'daru/io/importers/base' + +module Daru + module IO + module Importers + class Excelx < Base + # Imports a +Daru::DataFrame+ from a given XLSX file and sheet. + # + # @param path [String] Local / Remote path to XLSX file + # @param sheet [String, Integer] Sheet name or index in the given XLSX file. Defaults to 0, + # to parse the dataframe from the first sheet. + # @param skiprows [Integer] Skips the first +:skiprows+ number of rows from the + # sheet being parsed. + # @param skipcols [Integer] Skips the first +:skipcols+ number of columns from the + # sheet being parsed. + # @param order [Boolean] Defaults to true. 
When set to true, first row of the + # given sheet is used as the order of the Daru::DataFrame and data of + # the DataFrame consists of the remaining rows. + # @param index [Boolean] Defaults to false. When set to true, first column of the + # given sheet is used as the index of the Daru::DataFrame and data of + # the DataFrame consists of the remaining columns. + # + # When +:order+ is set to false, a default order (0 to n-1) is chosen for the DataFrame, + # and the data of the DataFrame consists of all rows in the sheet. + # + # @return A +Daru::DataFrame+ imported from the given XLSX file and sheet + # + # @example Importing from a local file + # path = 'spec/fixtures/excelx/Stock-counts-sheet.xlsx' + # sheet = 'Example Stock Counts' + # df = Daru::IO::Importers::Excelx.new(path, sheet: sheet).call + # df + # + # #=> + # Status Stock coun Item code New Descriptio Stock coun Offset G/L + # 0 H 1 nil nil New stock 2014-08-01 nil + # 1 nil 1 IND300654 2 New stock 2014-08-01 51035 + # 2 nil 1 IND43201 5 New stock 2014-08-01 51035 + # 3 nil 1 OUT30045 3 New stock 2014-08-01 51035 + # ... ... ... ... ... ... ... ... + # + # @example Importing from a local file without headers + # path = 'spec/fixtures/excelx/Stock-counts-sheet.xlsx' + # sheet = 'Example Stock Counts' + # df = Daru::IO::Importers::Excelx.new(path, sheet: sheet, order: false).call + # df + # + # #=> + # 0 1 2 3 4 5 6 + # 0 Status Stock coun Item code New Descriptio Stock coun Offset G/L + # 1 H 1 nil nil New stock 2014-08-01 nil + # 2 nil 1 IND300654 2 New stock 2014-08-01 51035 + # 3 nil 1 IND43201 5 New stock 2014-08-01 51035 + # 4 nil 1 OUT30045 3 New stock 2014-08-01 51035 + # ... ... ... ... ... ... ... ... 
+ # + # @example Importing from a remote URL + # path = 'https://www.exact.com/uk/images/downloads/getting-started-excel-sheets/Stock-counts-sheet.xlsx' + # sheet = 'Example Stock Counts' + # df = Daru::IO::Importers::Excelx.new(path, sheet: sheet).call + # df + # + # #=> + # Status Stock coun Item code New Descriptio Stock coun Offset G/L + # 0 H 1 nil nil New stock 2014-08-01 nil + # 1 nil 1 IND300654 2 New stock 2014-08-01 51035 + # 2 nil 1 IND43201 5 New stock 2014-08-01 51035 + # 3 nil 1 OUT30045 3 New stock 2014-08-01 51035 + # ... ... ... ... ... ... ... ... + def initialize(path, sheet: 0, order: true, index: false, skiprows: 0, skipcols: 0) + optional_gem 'roo', '~> 2.7.0' + + @path = path + @sheet = sheet + @order = order + @index = index + @skiprows = skiprows + @skipcols = skipcols + end + + def call + book = Roo::Excelx.new(@path) + worksheet = book.sheet(@sheet) + + @data = strip_html_tags(skip_data(worksheet.to_a, @skiprows, @skipcols)) + @index = process_index + @order = process_order # stays nil when order: false, so process_data keeps the first row + @data = process_data # trimmed before the fallback order below is sized from it + + Daru::DataFrame.rows(@data, order: @order || (0..@data.first.length-1), index: @index) + end + + private + + def process_data + return skip_data(@data, 1, 1) if @order && @index + return skip_data(@data, 1, 0) if @order + return skip_data(@data, 0, 1) if @index + @data + end + + def process_index + return nil unless @index + @index = @data.transpose.first + @index = skip_data(@index, 1) if @order + @index + end + + def process_order + return nil unless @order + @order = @data.first + @order = skip_data(@order, 1) if @index + @order + end + + def skip_data(data, rows, cols=nil) + return data[rows..-1].map { |row| row[cols..-1] } unless cols.nil? 
+ data[rows..-1] + end + + def strip_html_tags(data) + data.map do |row| + row.map do |ele| + next ele unless ele.is_a?(String) + ele.gsub(/<[^>]+>/, '') + end + end + end + end + end + end +end diff --git a/lib/daru/io/link.rb b/lib/daru/io/link.rb index c114df3..f40490a 100644 --- a/lib/daru/io/link.rb +++ b/lib/daru/io/link.rb @@ -1,7 +1,9 @@ module Daru class DataFrame class << self - def register_io_module(function, instance) + def register_io_module(function, instance=nil, &block) + return define_singleton_method(function, &block) if block_given? + if function.to_s.include? 'to' define_method(function) { |*args| instance.new(self, *args).call } else diff --git a/spec/daru/io/importers/excelx_spec.rb b/spec/daru/io/importers/excelx_spec.rb new file mode 100644 index 0000000..e642b97 --- /dev/null +++ b/spec/daru/io/importers/excelx_spec.rb @@ -0,0 +1,73 @@ +RSpec.describe Daru::IO::Importers::Excelx do + subject { described_class.new(path, opts).call } + + let(:opts) { {} } + + context 'when sheet is not specified' do + let(:path) { 'spec/fixtures/excelx/Microcode.xlsx' } + + it_behaves_like 'exact daru dataframe', + ncols: 32, + nrows: 37, + index: (0..36).to_a, + :'State.first' => 'FETCH0' + end + + context 'when sheet name is given' do + let(:path) { 'spec/fixtures/excelx/LOBSTAHS_rt.windows.xlsx' } + let(:opts) { {sheet: 'LOBSTAHS_rt.windows'} } + + it_behaves_like 'exact daru dataframe', + ncols: 3, + nrows: 93, + order: %w[lipid_class rt_win_max rt_win_min], + index: (0..92).to_a, + :'lipid_class.first' => 'DGCC' + end + + context 'when sheet contains nil elements' do + let(:path) { 'spec/fixtures/excelx/Stock-counts-sheet.xlsx' } + let(:opts) { {sheet: 2} } + + it_behaves_like 'exact daru dataframe', + ncols: 7, + nrows: 15, + order: [ + 'Status','Stock count number','Item code','New','Description', + 'Stock count date','Offset G/L Inventory' + ], + index: (0..14).to_a, + :'Item code.first' => nil, + :'Stock count number.first' => 1 + end + + context 
'when skipping rows and columns' do + let(:path) { 'spec/fixtures/excelx/pivot.xlsx' } + let(:opts) { {sheet: 'Data1', skiprows: 2, skipcols: 1} } + + it_behaves_like 'exact daru dataframe', + ncols: 9, + nrows: 2155, + index: (0..2154).to_a, + :'Unit Price.first' => 14 + end + + before do + %w[LOBSTAHS_rt.windows Microcode Stock-counts-sheet].each do |file| + WebMock + .stub_request(:get,"http://dummy-remote-url/#{file}.xlsx") + .to_return(status: 200, body: File.read("spec/fixtures/excelx/#{file}.xlsx")) + WebMock.disable_net_connect!(allow: /dummy-remote-url/) + end + end + + %w[LOBSTAHS_rt.windows Microcode Stock-counts-sheet].each do |file| # one example group per file, so each `let` stays scoped to its own file + context "checks for equal parsing of local and remote #{file}.xlsx" do + let(:local) { described_class.new("spec/fixtures/excelx/#{file}.xlsx").call } + let(:path) { "http://dummy-remote-url/#{file}.xlsx" } + + it { is_expected.to be_an(Daru::DataFrame) } + it { is_expected.to eq(local) } + end + end +end diff --git a/spec/fixtures/README.md b/spec/fixtures/README.md index 6669baa..7404fce 100644 --- a/spec/fixtures/README.md +++ b/spec/fixtures/README.md @@ -6,6 +6,13 @@ - `twitter.avro` : Downloaded from [here](https://github.com/miguno/avro-hadoop-starter/blob/master/src/test/resources/avro/twitter.avro). An AVRO schema with simple fields for Twitter users like `username`, `timestamp`, `tweet`. - `users.avro` : Downloaded from [here](https://github.com/apache/spark/blob/master/examples/src/main/resources/users.avro). A generic AVRO schema with fields like `name`, `favorite_color` and `favorite_numbers`. +### Excelx + +- `LOBSTAHS_rt.windows.xlsx` : Downloaded from [here](https://github.com/vanmooylipidomics/LOBSTAHS/blob/master/inst/doc/xlsx/LOBSTAHS_rt.windows.xlsx). Contains data about Lipid and Oxylipin Biomarker Screening Through Adduct Hierarchy Sequences (LOBSTAHS). Contains two sheets called `LOBSTAHS_rt.windows` and `Notes`. 
+- `Microcode.xlsx` : Downloaded from [here](https://github.com/tkim371/CS2200/blob/master/proj2/project2/Microcode.xlsx). +- `Stock-counts-sheet.xlsx` : Downloaded from [here](https://www.exact.com/uk/images/downloads/getting-started-excel-sheets/Stock-counts-sheet.xlsx). Contains data about stocks. Helps in ensuring that HTML tags of cell attributes are stripped off while constructing the DataFrame. +- `pivot.xlsx` : Downloaded from [here](http://myy.haaga-helia.fi/~taaak/r/pivot.xlsx). Contains pivot tables, which require `:skiprows` and `:skipcols` functionality. + ### JSON - `allsets.json` : An ultra-truncated version of the huge zip file available [here](http://mtgjson.com/json/AllSets.json.zip). Contains nested hashes. @@ -29,9 +36,10 @@ - `chicago.rds` : Contains data about city and temperatures. Can potentially create a `<6940*8> Daru::DataFrame`. Downloaded from [here](https://github.com/DataScienceSpecialization/courses/blob/master/03_GettingData/dplyr/chicago.rds) - `healthexp.rds` : Contains data comparing health expectancies among various countries. Can potentially create a `<3030*6> Daru::DataFrame`. Downloaded from [here](https://github.com/jcheng5/googleCharts/blob/master/inst/examples/bubble/healthexp.Rds) - `heights.rds` : Contains data as individual-wise, with attributes such as income, education, height, weight, etc. Can potentially create a `<3988*10> Daru::DataFrame`. Downloaded from [here](https://github.com/hadley/r4ds/blob/master/data/heights.RDS) -- `maacs_env.rds` : Contains data about Marine Air Command and Control System (MAACS) Environment. Can potentially create a `<750*27> Daru::DataFrame`. Downloaded from [here](https://github.com/DataScienceSpecialization/courses/blob/master/04_ExploratoryAnalysis/PlottingLattice/maacs_env.rds) +- `maacs_env.rds` : Contains data about Marine Air Command and Control System (MAACS) Environment. Can potentially create a `<750*27> Daru::DataFrame`. 
Downloaded from [here](https://github.com/DataScienceSpecialization/courses/blob/master/04_ExploratoryAnalysis/PlottingLattice/maacs_env.rds) - `RPPdataConverted.rds`: Contains data about author, citations and more of such fields. A fairly large dataset, which can potentially create a `<168*138> Daru::DataFrame`. Downloaded from [here](https://github.com/CenterForOpenScience/rpp/blob/master/data_allformats/RPPdataConverted.rds) + ### NOTE FOR FUTURE MAINTAINERS If you're having difficulty in finding fixtures files for a certain format, search in google for a specific filetype and keyword. The search url usually comes in a format like https://www.google.co.in/search?q=filetype:{filetype}+{keyword}. For example, https://www.google.co.in/search?q=filetype:avro+github diff --git a/spec/fixtures/excelx/LOBSTAHS_rt.windows.xlsx b/spec/fixtures/excelx/LOBSTAHS_rt.windows.xlsx new file mode 100644 index 0000000..6512cd1 Binary files /dev/null and b/spec/fixtures/excelx/LOBSTAHS_rt.windows.xlsx differ diff --git a/spec/fixtures/excelx/Microcode.xlsx b/spec/fixtures/excelx/Microcode.xlsx new file mode 100644 index 0000000..256ec23 Binary files /dev/null and b/spec/fixtures/excelx/Microcode.xlsx differ diff --git a/spec/fixtures/excelx/Stock-counts-sheet.xlsx b/spec/fixtures/excelx/Stock-counts-sheet.xlsx new file mode 100644 index 0000000..6a5475c Binary files /dev/null and b/spec/fixtures/excelx/Stock-counts-sheet.xlsx differ diff --git a/spec/fixtures/excelx/pivot.xlsx b/spec/fixtures/excelx/pivot.xlsx new file mode 100644 index 0000000..a7c2884 Binary files /dev/null and b/spec/fixtures/excelx/pivot.xlsx differ