-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_headers.py
30 lines (28 loc) · 1.32 KB
/
get_headers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import pandas as pd
import os
# Scan a directory for Excel workbooks and record their column headers.
def getHeaders(input_directory):
    """Record the column headers of every Excel workbook in a directory.

    Every entry of ``input_directory`` is announced on stdout. Entries whose
    name ends in ``.xlsx`` or ``.xls`` are opened with pandas and their column
    names are written to ``metadata.txt`` inside the same directory. The
    report ends with the union of all column names seen, and the same union
    is printed to stdout once the scan finishes.

    Args:
        input_directory: Path of the directory to scan. ``metadata.txt`` is
            created (or overwritten) inside it.

    Returns:
        None. Results are delivered through ``metadata.txt`` and stdout.

    Raises:
        OSError: If the directory cannot be listed or ``metadata.txt`` cannot
            be opened or written. A failure to read an individual workbook is
            not raised; it is logged to ``metadata.txt`` instead.
    """
    unique = set()
    report_path = os.path.join(input_directory, "metadata.txt")
    # Header names are arbitrary user text, so write the report as UTF-8
    # instead of the platform default encoding, which may not cover them.
    with open(report_path, 'w', encoding="utf-8") as f:
        for filename in os.listdir(input_directory):
            print("processing "+filename)
            # Only Excel workbooks are inspected; everything else is skipped.
            if not filename.endswith((".xlsx", ".xls")):
                continue
            file_path = os.path.join(input_directory, filename)
            try:
                # nrows=0 parses the header row only; data rows are not
                # needed because just the column names are used.
                headers = pd.read_excel(file_path, nrows=0).columns.tolist()
            except Exception as e:
                # Best effort: one unreadable workbook must not abort the
                # scan, so the failure is logged and the loop carries on.
                f.write(f"Error reading {filename}: {e}\n")
            else:
                f.write(f"Headers in '{filename}':\n")
                f.write(str(headers)+"\n")
                unique.update(headers)
            # Separator closing this workbook's section of the report.
            f.write("-" * 40+"\n")
        f.write("unique fields = " + str(unique))
    print("finished processing!")
    print(f"unique fields for {input_directory}= " + str(unique))
    print('-'*40)