Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SQUALL few-shot testing script (Codex) #9

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
modify preprocessing script
  • Loading branch information
NickSchoelkopf committed Apr 28, 2022
commit 51f406427460df7ad15b1a858db5559688102ed2
21 changes: 15 additions & 6 deletions preprocessing/preprocess_squall.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,13 @@ def process_squall_example(example: Dict[str, Any]) -> Dict[str, Any]:
if len(df_dict) != 1:
print(f"{len(df_dict)} tables found in {db_file_path}")


# get the original column dict
original_col_name_dict = json.load(open(COLUMN_DICT_FILE))[example["tbl"]]
# convert the column names to the original ones in the df_dict

df_dict_headers = {}
for table_name, df in df_dict.items():
df_dict[table_name].rename(columns=lambda x: original_col_name_dict[x], inplace=True)
df_dict[table_name].rename(columns=lambda x: original_col_name_dict[x] if x in original_col_name_dict.keys() else x, inplace=True)
df_dict_headers[table_name] = list(df.columns)
processed_example["db_table_headers"] = df_dict_headers

Expand Down Expand Up @@ -97,25 +99,32 @@ def build_column_name_dict(dataset: List[Dict[str, Any]]):
db_column_dict[example["tbl"]] = original_col_name_dict

print(f"Built the dict for {len(db_column_dict)} tables")

# dump to file
with open(COLUMN_DICT_FILE, "w") as f:
for table_id, original_col_name_dict in db_column_dict.items():
f.write(f"{table_id}: {original_col_name_dict}\n")
f.write(json.dumps(db_column_dict))
# for table_id, original_col_name_dict in db_column_dict.items():
# entry = {table_id: original_col_name_dict}
# f.write(json.dumps(entry)+"\n")

def preprocess_squall_dataset(
    dataset: List[Dict[str, Any]],
    output_path: str = "data/squall/squall_processed.json",
) -> None:
    """Process every example in ``dataset`` and dump the results as JSON.

    Each example is passed through ``process_squall_example``; the results
    are collected in input order and written to ``output_path`` as a single
    JSON array.

    Args:
        dataset: The raw examples to process.
        output_path: Destination of the JSON dump. The default is the path
            this function previously hard-coded, so existing callers are
            unaffected.
    """
    processed_data = [process_squall_example(example) for example in tqdm(dataset)]

    # Plain write mode suffices: the file is only written here, never read.
    with open(output_path, "w") as f:
        json.dump(processed_data, f)


def main():
    """Load the raw data, build the column-name dict, then preprocess."""
    examples = load_json(DATA_PATH)

    # Order matters: build_column_name_dict writes COLUMN_DICT_FILE, which
    # process_squall_example loads while preprocessing each example.
    build_column_name_dict(examples)
    preprocess_squall_dataset(examples)


if __name__ == "__main__":
    main()