Skip to content

Commit

Permalink
Fix header
Browse files Browse the repository at this point in the history
  • Loading branch information
ryo-ma committed Feb 18, 2024
1 parent d778b0e commit 7372112
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 4 deletions.
3 changes: 2 additions & 1 deletion prefecture.py → constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
prefectures = [
PREFECTURES = [
"北海道",
"青森県",
"岩手県",
Expand Down Expand Up @@ -47,3 +47,4 @@
"鹿児島県",
"沖縄県",
]

16 changes: 13 additions & 3 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
import pandas as pd
import tabula
import os
from prefecture import prefectures
from constants import PREFECTURES


def delete_headers(df: pd.DataFrame, line_number: int) -> pd.DataFrame:
    """Drop the leading header rows from a table extracted from one PDF page.

    A page is treated as carrying header rows only when its top-left cell
    is exactly "基本情報"; otherwise the frame is returned untouched.

    Args:
        df: Table for a single page.
        line_number: How many leading rows to drop when the marker is found.

    Returns:
        A new DataFrame without the first ``line_number`` rows when the
        marker is present; otherwise ``df`` itself.
    """
    # An empty frame (no rows or no columns) has no top-left cell, so
    # ``df.iloc[0, 0]`` would raise IndexError; there is nothing to strip.
    if df.empty:
        return df
    if df.iloc[0, 0] == "基本情報":
        return df.drop(df.index[:line_number])
    return df


# Create the output directory if it is missing; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("./output_files", exist_ok=True)

# Walk every entry of PREFECTURES so the loop bound cannot drift from the
# list: the previous hard-coded range(1, 47) stopped at number 46 and never
# processed the last prefecture. Input directories are numbered from 1.
for i, prefecture in enumerate(PREFECTURES, start=1):
    print("PREFECTURE_NUMBER", i)
    data_dir = f"./data_files/shinryoujo_{i}"
    # os.listdir() order is arbitrary; sort so the chosen file is stable.
    opendata_files = sorted(os.listdir(data_dir))
    dfs = tabula.read_pdf(
        f"{data_dir}/{opendata_files[0]}",
        lattice=True,
        pages='all',
        pandas_options={'header': None},
    )
    # On the first page only the "基本情報" row is removed (1 line); on every
    # later page both the "基本情報" row and the header row are removed
    # (2 lines).
    dfs = [
        delete_headers(page_df, 1 if page_no == 0 else 2)
        for page_no, page_df in enumerate(dfs)
    ]
    # Strip every CR/LF embedded in cell text in a single regex pass.
    merged_df = pd.concat(dfs).replace(r'[\r\n]', '', regex=True)
    # header=False keeps pandas' integer column labels out of the CSV.
    merged_df.to_csv(f"./output_files/{prefecture}.csv", header=False, index=False)

0 comments on commit 7372112

Please sign in to comment.