Fixing Outer merge
sonicfurqan committed Feb 13, 2020
1 parent be3239c commit 58a2e92
Showing 4 changed files with 32 additions and 2 deletions.
3 changes: 3 additions & 0 deletions Data/MERGE/Log/Example.csv
@@ -0,0 +1,3 @@
+REF KEY,ORIGIN,Comment
+6,Master,Refrence Record not found in child org
+8,Master,Refrence Record not found in child org
8 changes: 8 additions & 0 deletions Data/MERGE/Result/Example.csv
@@ -0,0 +1,8 @@
+Id,Name,Email,EXT_Id,SOURCE
+6,Gb,,,Master
+2,Bb,[email protected],2,Master/Child
+3,Cb,[email protected],3,Master/Child
+4,Db,[email protected],4,Master/Child
+8,,[email protected],,Master
+,Ab,[email protected],1,Child
+,,,5,Child
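
Note: the Result fixture above encodes the intended outer-merge semantics: rows found only in the master keep SOURCE=Master, rows found only in the child get SOURCE=Child, and matched rows get Master/Child. merge.py builds this classification by hand; as a rough standalone sketch (toy data and column names, not the project's real inputs), pandas' merge indicator produces the same three-way split:

    import pandas as pd

    # Illustrative stand-ins for the master/child inputs (not the real fixtures).
    master = pd.DataFrame({"REF": [2, 6], "Name": ["Bb", "Gb"]})
    child = pd.DataFrame({"REF": [2, 1], "Email": ["[email protected]", "[email protected]"]})

    # indicator=True labels each row left_only / right_only / both, which maps
    # directly onto the fixture's Master / Child / Master-Child SOURCE values.
    merged = master.merge(child, on="REF", how="outer", indicator=True)
    merged["SOURCE"] = merged["_merge"].map(
        {"left_only": "Master", "right_only": "Child", "both": "Master/Child"})
    print(merged.drop(columns="_merge"))
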
5 changes: 3 additions & 2 deletions Support/utility.py
@@ -66,14 +66,15 @@ def __create_folder(FOLDER, SUBFOLDER):


 def read(file_name):
-    encoding_europ = "ISO-8859-1"
+    encoding_europe = "ISO-8859-1"
     encoding_default = "utf8"
     try:
         return pd.read_csv(
             file_name, skip_blank_lines=True, sep=",", dtype=object, encoding=encoding_default)
     except:
         print("Fallback to europe Encoding")
         return pd.read_csv(
-            file_name, skip_blank_lines=True, sep=",", dtype=object, encoding=encoding_europ)
+            file_name, skip_blank_lines=True, sep=",", dtype=object, encoding=encoding_europe)
 
 
 def save(file_name, data):
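
Note: the rename here only fixes a variable typo; behaviorally, read() is unchanged. It attempts UTF-8 first and retries with ISO-8859-1 when decoding fails. A minimal standalone sketch of that fallback, with an illustrative function name and the bare except narrowed to UnicodeDecodeError:

    import pandas as pd

    def read_csv_with_fallback(file_name):
        # Most files decode as UTF-8; try that first.
        try:
            return pd.read_csv(file_name, skip_blank_lines=True, sep=",",
                               dtype=object, encoding="utf8")
        except UnicodeDecodeError:
            # Western-European exports often need Latin-1 (ISO-8859-1) instead.
            print("Fallback to europe Encoding")
            return pd.read_csv(file_name, skip_blank_lines=True, sep=",",
                               dtype=object, encoding="ISO-8859-1")
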
18 changes: 18 additions & 0 deletions merge.py
@@ -163,6 +163,10 @@ def log(csv_data_frame):

 # helper variables
 child_coluums = CHILD_RECORDS.columns
+DUPLICATE_RECORDS_FOUND = pd.DataFrame(columns=child_coluums)
+
+FOUND_INDEXS = []
+
 try:
     for master_id in MASTER_RECORDS.index:
         CHUNK_INDEX = CHUNK_INDEX + 1
@@ -173,6 +177,11 @@ def log(csv_data_frame):
         if type(child_record_ref) == pd.core.frame.DataFrame:
             child_record_ref = child_record_ref.iloc[0, :]
 
+        # removing found record commenting for large data set
+        FOUND_INDEXS.append(master_id)
+        # CHILD_RECORDS = CHILD_RECORDS.drop(
+        #     child_record_ref[CHILD_ORG_REF_FIELD], axis=0,)
+
         # comparing each cell value based on header
         for field_name in child_coluums:
             if pd.notnull(child_record_ref[field_name]):
@@ -205,6 +214,10 @@
                     master_record_ref, ignore_index=True)
         # writing data to csv in chunks of 200 records to reduce memory
         if CHUNK_INDEX == len(MASTER_RECORDS.index):
+            if len(FOUND_INDEXS) > 0:
+                CHILD_RECORDS.drop(FOUND_INDEXS, axis=0, inplace=True)
+                FOUND_INDEXS[:] = []
+
             print("\n")
             read_end = time.time()
             print("CHUNK COUNT => %s " % CHUNK_INDEX)
@@ -213,6 +226,7 @@
print("Refrence not Found Count %s" % Duplicate_Not_Found_error)
print("Remaing Child records %s" % len(CHILD_RECORDS))
Duplicate_Not_Found_error = 0

if MERGE_TYPE == "outer":
temp = {FIELD_NAME_TO_STORE_SOURCE: "Child"}
CHILD_RECORDS = CHILD_RECORDS.assign(**temp)
@@ -222,6 +236,10 @@
                 MERGED_RECORDS = MERGED_RECORDS.iloc[0:0]
                 check_memory(CHUNK_INDEX)
         elif CHUNK_INDEX == CHUNK:
+            if len(FOUND_INDEXS) > 0:
+                CHILD_RECORDS.drop(FOUND_INDEXS, axis=0, inplace=True)
+                FOUND_INDEXS[:] = []
+
             print("\n")
             read_end = time.time()
             print("CHUNK COUNT => %s " % CHUNK)
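
Note: the merge.py change is the substance of the fix. Instead of dropping each matched child row inside the loop (every DataFrame.drop copies the remaining frame, which gets expensive on large data sets), matched indexes accumulate in FOUND_INDEXS and are dropped in one batch at each chunk boundary. A small self-contained sketch of that deferred-drop pattern, with toy data and illustrative names:

    import pandas as pd

    child = pd.DataFrame({"val": list("abcdef")})  # stand-in for CHILD_RECORDS
    found_indexes = []

    # During a chunk, record matched indexes instead of dropping row by row.
    for idx in [1, 3, 4]:
        found_indexes.append(idx)

    # One batched drop per chunk is far cheaper than a drop per match.
    if found_indexes:
        child.drop(found_indexes, axis=0, inplace=True)
        found_indexes[:] = []  # reset in place, ready for the next chunk
    print(child)  # rows 0, 2, 5 remain
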
