Fixing Outer merge
sonicfurqan committed Feb 13, 2020
1 parent be3239c commit 58a2e92
Showing 4 changed files with 32 additions and 2 deletions.
3 changes: 3 additions & 0 deletions Data/MERGE/Log/Example.csv
@@ -0,0 +1,3 @@
+REF KEY,ORIGIN,Comment
+6,Master,Refrence Record not found in child org
+8,Master,Refrence Record not found in child org
8 changes: 8 additions & 0 deletions Data/MERGE/Result/Example.csv
@@ -0,0 +1,8 @@
+Id,Name,Email,EXT_Id,SOURCE
+6,Gb,,,Master
+2,Bb,[email protected],2,Master/Child
+3,Cb,[email protected],3,Master/Child
+4,Db,[email protected],4,Master/Child
+8,,[email protected],,Master
+,Ab,[email protected],1,Child
+,,,5,Child
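
Note: the Result fixture above encodes the intended outer-merge semantics: rows found only in the master keep SOURCE=Master, rows found only in the child get SOURCE=Child, and matched rows get Master/Child. merge.py builds this classification by hand; as a rough standalone sketch (toy data and column names, not the project's real inputs), pandas' merge indicator produces the same three-way split:

    import pandas as pd

    # Illustrative stand-ins for the master/child inputs (not the real fixtures).
    master = pd.DataFrame({"REF": [2, 6], "Name": ["Bb", "Gb"]})
    child = pd.DataFrame({"REF": [2, 1], "Email": ["[email protected]", "[email protected]"]})

    # indicator=True labels each row left_only / right_only / both, which maps
    # directly onto the fixture's Master / Child / Master-Child SOURCE values.
    merged = master.merge(child, on="REF", how="outer", indicator=True)
    merged["SOURCE"] = merged["_merge"].map(
        {"left_only": "Master", "right_only": "Child", "both": "Master/Child"})
    print(merged.drop(columns="_merge"))
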
5 changes: 3 additions & 2 deletions Support/utility.py
@@ -66,14 +66,15 @@ def __create_folder(FOLDER, SUBFOLDER):


 def read(file_name):
-    encoding_europ = "ISO-8859-1"
+    encoding_europe = "ISO-8859-1"
     encoding_default = "utf8"
     try:
         return pd.read_csv(
             file_name, skip_blank_lines=True, sep=",", dtype=object, encoding=encoding_default)
     except:
         print("Fallback to europe Encoding")
         return pd.read_csv(
-            file_name, skip_blank_lines=True, sep=",", dtype=object, encoding=encoding_europ)
+            file_name, skip_blank_lines=True, sep=",", dtype=object, encoding=encoding_europe)
 
 
 def save(file_name, data):
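
Note: the rename here only fixes a variable typo; behaviorally, read() is unchanged. It attempts UTF-8 first and retries with ISO-8859-1 when decoding fails. A minimal standalone sketch of that fallback, with an illustrative function name and the bare except narrowed to UnicodeDecodeError:

    import pandas as pd

    def read_csv_with_fallback(file_name):
        # Most files decode as UTF-8; try that first.
        try:
            return pd.read_csv(file_name, skip_blank_lines=True, sep=",",
                               dtype=object, encoding="utf8")
        except UnicodeDecodeError:
            # Western-European exports often need Latin-1 (ISO-8859-1) instead.
            print("Fallback to europe Encoding")
            return pd.read_csv(file_name, skip_blank_lines=True, sep=",",
                               dtype=object, encoding="ISO-8859-1")
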
18 changes: 18 additions & 0 deletions merge.py
@@ -163,6 +163,10 @@ def log(csv_data_frame):

 # helper variables
 child_coluums = CHILD_RECORDS.columns
+DUPLICATE_RECORDS_FOUND = pd.DataFrame(columns=child_coluums)
+
+FOUND_INDEXS = []
+
 try:
     for master_id in MASTER_RECORDS.index:
         CHUNK_INDEX = CHUNK_INDEX + 1
@@ -173,6 +177,11 @@ def log(csv_data_frame):
         if type(child_record_ref) == pd.core.frame.DataFrame:
             child_record_ref = child_record_ref.iloc[0, :]
 
+        # removing found record commenting for large data set
+        FOUND_INDEXS.append(master_id)
+        # CHILD_RECORDS = CHILD_RECORDS.drop(
+        #     child_record_ref[CHILD_ORG_REF_FIELD], axis=0,)
+
         # comparing each cell value based on header
         for field_name in child_coluums:
             if pd.notnull(child_record_ref[field_name]):
@@ -205,6 +214,10 @@
                     master_record_ref, ignore_index=True)
         # writing data to csv in chunks of 200 records to reduce memory
         if CHUNK_INDEX == len(MASTER_RECORDS.index):
+            if len(FOUND_INDEXS) > 0:
+                CHILD_RECORDS.drop(FOUND_INDEXS, axis=0, inplace=True)
+                FOUND_INDEXS[:] = []
+
             print("\n")
             read_end = time.time()
             print("CHUNK COUNT => %s " % CHUNK_INDEX)
@@ -213,6 +226,7 @@
print("Refrence not Found Count %s" % Duplicate_Not_Found_error)
print("Remaing Child records %s" % len(CHILD_RECORDS))
Duplicate_Not_Found_error = 0

if MERGE_TYPE == "outer":
temp = {FIELD_NAME_TO_STORE_SOURCE: "Child"}
CHILD_RECORDS = CHILD_RECORDS.assign(**temp)
@@ -222,6 +236,10 @@
                 MERGED_RECORDS = MERGED_RECORDS.iloc[0:0]
                 check_memory(CHUNK_INDEX)
         elif CHUNK_INDEX == CHUNK:
+            if len(FOUND_INDEXS) > 0:
+                CHILD_RECORDS.drop(FOUND_INDEXS, axis=0, inplace=True)
+                FOUND_INDEXS[:] = []
+
             print("\n")
             read_end = time.time()
             print("CHUNK COUNT => %s " % CHUNK)
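
Note: the merge.py change is the substance of the fix. Instead of dropping each matched child row inside the loop (every DataFrame.drop copies the remaining frame, which gets expensive on large data sets), matched indexes accumulate in FOUND_INDEXS and are dropped in one batch at each chunk boundary. A small self-contained sketch of that deferred-drop pattern, with toy data and illustrative names:

    import pandas as pd

    child = pd.DataFrame({"val": list("abcdef")})  # stand-in for CHILD_RECORDS
    found_indexes = []

    # During a chunk, record matched indexes instead of dropping row by row.
    for idx in [1, 3, 4]:
        found_indexes.append(idx)

    # One batched drop per chunk is far cheaper than a drop per match.
    if found_indexes:
        child.drop(found_indexes, axis=0, inplace=True)
        found_indexes[:] = []  # reset in place, ready for the next chunk
    print(child)  # rows 0, 2, 5 remain
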
