This repository was archived by the owner on Oct 29, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_data_set.py
102 lines (82 loc) · 4.24 KB
/
build_data_set.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 17 10:34:50 2019
@author: GAllison
This script performs the overall task of creating a FracFocus database from
the raw excel collection and creating the tables used to make data sets.
Change the file handles at the top of core.Data_set_constructor to point to appropriate
directories.
Within Open-FF, there are three "working" FracFocus raw files. When new data is
downloaded from FracFocus, we put it in a file named "test_data.zip" (If we
are performing a tripwire scan, we move the previous "test_data.zip" into
"test_data_last.zip""). These two files are handled in the script:
"get_new_raw_file.py" and is performed frequently to document new disclosures.
That script also saves a copy of test_data.zip into an archive directory once
a week - usually the same day that the weekly blog post is generated.
The third working raw file in Open-FF is "curentData.zip". This is simply a
copy of "test_data.zip" made once I'm ready to start a curation process. This
separation of files allows Open-FF to continue to download data into test_data.zip
while I am working on the curation and repository process (which can take several
days). The currentData.zip file is typically what is used in this script to
build a file eventually ready for a repository.
# 3/2022 - separating Skytruth archive from main data
# 3/2022 - adding FFV1_scrape as an additional separate data set
"""
## For standard processing, set the following to 'bulk'
data_source = 'bulk' # can be 'bulk', 'FFV1_scrape' or 'SkyTruth'
# or 'NM_scrape_2022_05'
bulk_fn = 'currentData' # name of raw bulk archive file
construct_from_scratch = True # normally True
do_end_tests = True # normally True, only for 'bulk' set, ignore otherwise
make_output_files = False # True for final bulk runs, adds lots of compile time
do_abbrev = False # normally False, for some testing purposes
# used to control files to read from the bulk download. If less than full
# set, this process is performed in "test" mode.
startfile = 0 # 0 for full set
endfile = None # None for no upper limit
if (startfile!=0) | (endfile!=None) :
# test mode does not overwrite production mode pickles
mode = 'TEST'
print('\n'+30*'-'+ 'Performing in TEST mode!'+30*'-'+'\n')
else:
mode = 'PRODUCTION'
import core.Data_set_constructor as set_const
import core.Analysis_set as ana_set
def run_build(bulk_fn = bulk_fn,
mode=mode,
data_source=data_source,
make_output_files=make_output_files,
startfile=startfile,
endfile=endfile,
do_abbrev=do_abbrev,
do_end_tests=do_end_tests,
construct_from_scratch=construct_from_scratch):
if construct_from_scratch:
# this can be skipped when testing if pickles already made
t = set_const.Data_set_constructor(bulk_fn=bulk_fn, const_mode=mode,
make_files=make_output_files,
startfile=startfile,
endfile=endfile,
data_source=data_source,
abbreviated=do_abbrev)\
.create_full_set()
if do_end_tests&(mode=='PRODUCTION')&(data_source=='bulk'):
import core.Tests_of_final as tests
print('\nStarting tests of final product')
print(' Creating test set of FULL data')
df = ana_set.Full_set(bulk_fn=bulk_fn,
pkl_when_creating=False).get_set()
tests.final_test(df).run_all_tests()
if (make_output_files == True) &(data_source=='bulk'):
print('\n\n -- Generating output data sets\n')
ana_set.Standard_data_set(bulk_fn=bulk_fn,
pkl_when_creating=False).save_compressed()
ana_set.Full_set(bulk_fn=bulk_fn,
pkl_when_creating=False).save_compressed()
print('\nBuild completed\n')
try:
return t
except:
print('No data set constructor to return.')
if __name__ == '__main__':
t = run_build() # build using the defaults at the top.