"""Convert each prefecture's clinic open-data PDF into one CSV file.

For prefecture number ``n`` (1-based position in ``constants.PREFECTURES``,
the list formerly exposed as ``prefecture.prefectures``) the PDF found in
``./data_files/shinryoujo_<n>/`` is parsed with tabula, the per-page tables
are stripped of their repeated heading rows, merged, and written to
``./output_files/<prefecture name>.csv``.
"""

import os

import pandas as pd

OUTPUT_DIR = "./output_files"


def delete_headers(df: pd.DataFrame, line_number: int) -> pd.DataFrame:
    """Drop the leading heading rows from one page table.

    Args:
        df: Table extracted from a single PDF page.
        line_number: How many leading rows to remove when the page starts
            with the "基本情報" banner.

    Returns:
        ``df`` without its first ``line_number`` rows if its top-left cell
        equals "基本情報"; otherwise ``df`` unchanged. An empty table is
        returned as is instead of raising ``IndexError``.
    """
    # A page with no rows (or no columns) has no top-left cell to inspect.
    if df.empty:
        return df
    if df.iloc[0, 0] == "基本情報":
        # Positional slicing: removes exactly the first rows, whatever the
        # index labels happen to be.
        return df.iloc[line_number:]
    return df


def _find_pdf(data_dir):
    """Return the path of the first PDF (by name) in *data_dir*, or None."""
    if not os.path.isdir(data_dir):
        return None
    # os.listdir order is unspecified, so sort for a deterministic choice and
    # ignore stray non-PDF entries.
    pdf_names = sorted(
        name for name in os.listdir(data_dir) if name.lower().endswith(".pdf")
    )
    if not pdf_names:
        return None
    return f"{data_dir}/{pdf_names[0]}"


def _merge_pages(pages):
    """Strip per-page heading rows, concatenate, and remove line breaks."""
    # On the first page only the "基本情報" row is removed (1 row) so the
    # column header row is kept once; on every later page both the
    # "基本情報" row and the repeated header row are removed (2 rows).
    trimmed = [
        delete_headers(page, 1 if page_index == 0 else 2)
        for page_index, page in enumerate(pages)
    ]
    # One character class removes every CR and LF inside cell text.
    return pd.concat(trimmed).replace(r"[\r\n]", "", regex=True)


def main() -> None:
    """Convert every prefecture listed in ``constants.PREFECTURES``."""
    # Imported here rather than at module level so that delete_headers can be
    # imported without tabula or the project-local constants module.
    import tabula
    from constants import PREFECTURES

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Iterate over the list itself so the last prefecture is not skipped by a
    # hard-coded upper bound.
    for number, prefecture in enumerate(PREFECTURES, start=1):
        print("PREFECTURE_NUMBER", number)
        data_dir = f"./data_files/shinryoujo_{number}"
        pdf_path = _find_pdf(data_dir)
        if pdf_path is None:
            print(f"SKIPPED {prefecture}: no PDF found in {data_dir}")
            continue

        pages = tabula.read_pdf(
            pdf_path,
            lattice=True,
            pages="all",
            pandas_options={"header": None},
        )
        if not pages:
            print(f"SKIPPED {prefecture}: no tables extracted from {pdf_path}")
            continue

        merged_df = _merge_pages(pages)
        merged_df.to_csv(
            f"{OUTPUT_DIR}/{prefecture}.csv", header=False, index=False
        )


if __name__ == "__main__":
    main()