From f66a528bc2835859cd04b969c67156537c25afe7 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 15 May 2018 23:28:47 +0200 Subject: [PATCH] fix Duplicate, save list of duplicates on disk + prevent empty hash creation --- bin/Duplicates.py | 11 ++++-- bin/LAUNCH.sh | 2 -- bin/packages/Paste.py | 36 +++++++++----------- bin/packages/config.cfg.sample | 5 +++ var/www/modules/Flask_config.py | 12 +++++++ var/www/modules/showpaste/Flask_showpaste.py | 17 +++++---- 6 files changed, 52 insertions(+), 31 deletions(-) diff --git a/bin/Duplicates.py b/bin/Duplicates.py index 58bad3f0..0c24bec1 100755 --- a/bin/Duplicates.py +++ b/bin/Duplicates.py @@ -158,7 +158,11 @@ # Adding hashes in Redis for hash_type, paste_hash in paste_hashes.items(): r_serv1.set(paste_hash, index) - r_serv1.sadd("HASHS_"+hash_type, paste_hash) + #bad hash + if paste_hash == '': + print('bad Hash: ' + hash_type) + else: + r_serv1.sadd("HASHS_"+hash_type, paste_hash) ##################### Similarity found ####################### @@ -174,10 +178,11 @@ if dupl != []: dupl = list(dupl) PST.__setattr__("p_duplicate", dupl) - PST.save_attribute_redis("p_duplicate", dupl) - PST.save_others_pastes_attribute_duplicate("p_duplicate", dupl) + PST.save_attribute_duplicate(dupl) + PST.save_others_pastes_attribute_duplicate(dupl) publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_path)) print('{}Detected {}'.format(to_print, len(dupl))) + print('') y = time.time() diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 974938a6..aca72e8e 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -110,8 +110,6 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "Duplicates" bash -c './Duplicates.py; read x' sleep 0.1 - screen -S "Script_AIL" -X screen -t "Attributes" bash -c './Attributes.py; read x' - sleep 0.1 screen -S "Script_AIL" -X screen -t "Lines" bash -c './Lines.py; read x' sleep 0.1 screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c './DomClassifier.py; read x' diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index 332981f9..ccaf3400 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -76,6 +76,11 @@ def __init__(self, p_path): port=cfg.getint("Redis_Data_Merging", "port"), db=cfg.getint("Redis_Data_Merging", "db"), decode_responses=True) + self.store_duplicate = redis.StrictRedis( + host=cfg.get("ARDB_Metadata", "host"), + port=cfg.getint("ARDB_Metadata", "port"), + db=cfg.getint("ARDB_Metadata", "db"), + decode_responses=True) self.p_path = p_path self.p_name = os.path.basename(self.p_path) @@ -272,9 +277,9 @@ def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10): return False, var def _get_p_duplicate(self): - self.p_duplicate = self.store.hget(self.p_path, "p_duplicate") + self.p_duplicate = self.store_duplicate.smembers('dup:'+self.p_path) if self.p_duplicate is not None: - return self.p_duplicate + return list(self.p_duplicate) else: return '[]' @@ -323,27 +328,20 @@ def save_attribute_redis(self, attr_name, value): else: self.store.hset(self.p_path, attr_name, json.dumps(value)) - def save_others_pastes_attribute_duplicate(self, attr_name, list_value): + def save_attribute_duplicate(self, value): + """ + Save an attribute as a field + """ + for tuple in value: + self.store_duplicate.sadd('dup:'+self.p_path, tuple) + + def save_others_pastes_attribute_duplicate(self, list_value): """ Save a new duplicate on others pastes """ for hash_type, path, percent, date in list_value: - #get json - json_duplicate = self.store.hget(path, attr_name) - #json save on redis - if json_duplicate is not None: - list_duplicate = (json.loads(json_duplicate)) - # avoid duplicate, a paste can be send by multiples modules - to_add = [hash_type, self.p_path, percent, date] - if to_add not in list_duplicate: - list_duplicate.append(to_add) - self.store.hset(path, attr_name, json.dumps(list_duplicate)) - - else: - # create the new list - list_duplicate = [[hash_type, self.p_path, percent, date]] - self.store.hset(path, attr_name, json.dumps(list_duplicate)) - + to_add = [hash_type, self.p_path, percent, date] + self.store_duplicate.sadd('dup:'+path,to_add) def _get_from_redis(self, r_serv): ans = {} diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 21eb264c..1eec715d 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -152,6 +152,11 @@ host = localhost port = 6382 db = 6 +[ARDB_Metadata] +host = localhost +port = 6382 +db = 7 + [Url] cc_critical = DE diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py index f9e7aef4..41745f21 100644 --- a/var/www/modules/Flask_config.py +++ b/var/www/modules/Flask_config.py @@ -65,6 +65,18 @@ db=cfg.getint("Redis_Paste_Name", "db"), decode_responses=True) +r_serv_tags = redis.StrictRedis( + host=cfg.get("ARDB_Tags", "host"), + port=cfg.getint("ARDB_Tags", "port"), + db=cfg.getint("ARDB_Tags", "db"), + decode_responses=True) + +r_serv_metadata = redis.StrictRedis( + host=cfg.get("ARDB_Metadata", "host"), + port=cfg.getint("ARDB_Metadata", "port"), + db=cfg.getint("ARDB_Metadata", "db"), + decode_responses=True) + # VARIABLES # max_preview_char = int(cfg.get("Flask", "max_preview_char")) # Maximum number of character to display in the tooltip max_preview_modal = int(cfg.get("Flask", "max_preview_modal")) # Maximum number of character to display in the modal diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index aea0fa08..3a3be9be 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -18,6 +18,7 @@ app = Flask_config.app cfg = Flask_config.cfg r_serv_pasteName = Flask_config.r_serv_pasteName +r_serv_metadata = Flask_config.r_serv_metadata max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal DiffMaxLineLength = Flask_config.DiffMaxLineLength @@ -38,20 +39,22 @@ def showpaste(content_range): p_mime = paste.p_mime p_lineinfo = paste.get_lines_info() p_content = paste.get_p_content() - p_duplicate_full_list = json.loads(paste._get_p_duplicate()) + p_duplicate_str_full_list = paste._get_p_duplicate() + + p_duplicate_full_list = [] p_duplicate_list = [] p_simil_list = [] p_date_list = [] p_hashtype_list = [] - for dup_list in p_duplicate_full_list: + for dup_list in p_duplicate_str_full_list: + dup_list = dup_list[1:-1].replace('\'', '').replace(' ', '').split(',') if dup_list[0] == "tlsh": dup_list[2] = 100 - int(dup_list[2]) else: - print('dup_list') - print(dup_list) dup_list[2] = int(dup_list[2]) + p_duplicate_full_list.append(dup_list) #p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True) @@ -69,8 +72,8 @@ def showpaste(content_range): comp_vals.append(p_duplicate_full_list[i][2]) dup_list_removed.append(i) - hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types) - comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals) + #hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types) + #comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals) if len(p_duplicate_full_list[dup_list_index]) > 3: try: @@ -80,7 +83,7 @@ def showpaste(content_range): date_paste = str(p_duplicate_full_list[dup_list_index][3]) else: date_paste = "No date available" - new_dup_list.append([hash_types.replace("'", ""), p_duplicate_full_list[dup_list_index][1], comp_vals, date_paste]) + new_dup_list.append([hash_types, p_duplicate_full_list[dup_list_index][1], comp_vals, date_paste]) # Create the list to pass to the webpage for dup_list in new_dup_list: