Skip to content

Commit

Permalink
fix Duplicate, save list of duplicates on disk + prevent empty hash creation
Browse files Browse the repository at this point in the history
  • Loading branch information
Terrtia committed May 15, 2018
1 parent 225fe76 commit f66a528
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 31 deletions.
11 changes: 8 additions & 3 deletions bin/Duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,11 @@
# Adding hashes in Redis
for hash_type, paste_hash in paste_hashes.items():
r_serv1.set(paste_hash, index)
r_serv1.sadd("HASHS_"+hash_type, paste_hash)
#bad hash
if paste_hash == '':
print('bad Hash: ' + hash_type)
else:
r_serv1.sadd("HASHS_"+hash_type, paste_hash)

##################### Similarity found #######################

Expand All @@ -174,10 +178,11 @@
if dupl != []:
dupl = list(dupl)
PST.__setattr__("p_duplicate", dupl)
PST.save_attribute_redis("p_duplicate", dupl)
PST.save_others_pastes_attribute_duplicate("p_duplicate", dupl)
PST.save_attribute_duplicate(dupl)
PST.save_others_pastes_attribute_duplicate(dupl)
publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_path))
print('{}Detected {}'.format(to_print, len(dupl)))
print('')

y = time.time()

Expand Down
2 changes: 0 additions & 2 deletions bin/LAUNCH.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,6 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "Duplicates" bash -c './Duplicates.py; read x'
sleep 0.1
screen -S "Script_AIL" -X screen -t "Attributes" bash -c './Attributes.py; read x'
sleep 0.1
screen -S "Script_AIL" -X screen -t "Lines" bash -c './Lines.py; read x'
sleep 0.1
screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c './DomClassifier.py; read x'
Expand Down
36 changes: 17 additions & 19 deletions bin/packages/Paste.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ def __init__(self, p_path):
port=cfg.getint("Redis_Data_Merging", "port"),
db=cfg.getint("Redis_Data_Merging", "db"),
decode_responses=True)
self.store_duplicate = redis.StrictRedis(
host=cfg.get("ARDB_Metadata", "host"),
port=cfg.getint("ARDB_Metadata", "port"),
db=cfg.getint("ARDB_Metadata", "db"),
decode_responses=True)

self.p_path = p_path
self.p_name = os.path.basename(self.p_path)
Expand Down Expand Up @@ -272,9 +277,9 @@ def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10):
return False, var

def _get_p_duplicate(self):
self.p_duplicate = self.store.hget(self.p_path, "p_duplicate")
self.p_duplicate = self.store_duplicate.smembers('dup:'+self.p_path)
if self.p_duplicate is not None:
return self.p_duplicate
return list(self.p_duplicate)
else:
return '[]'

Expand Down Expand Up @@ -323,27 +328,20 @@ def save_attribute_redis(self, attr_name, value):
else:
self.store.hset(self.p_path, attr_name, json.dumps(value))

def save_others_pastes_attribute_duplicate(self, attr_name, list_value):
def save_attribute_duplicate(self, value):
"""
Save an attribute as a field
"""
for tuple in value:
self.store_duplicate.sadd('dup:'+self.p_path, tuple)

def save_others_pastes_attribute_duplicate(self, list_value):
"""
Save a new duplicate on others pastes
"""
for hash_type, path, percent, date in list_value:
#get json
json_duplicate = self.store.hget(path, attr_name)
#json save on redis
if json_duplicate is not None:
list_duplicate = (json.loads(json_duplicate))
# avoid duplicate, a paste can be send by multiples modules
to_add = [hash_type, self.p_path, percent, date]
if to_add not in list_duplicate:
list_duplicate.append(to_add)
self.store.hset(path, attr_name, json.dumps(list_duplicate))

else:
# create the new list
list_duplicate = [[hash_type, self.p_path, percent, date]]
self.store.hset(path, attr_name, json.dumps(list_duplicate))

to_add = [hash_type, self.p_path, percent, date]
self.store_duplicate.sadd('dup:'+path,to_add)

def _get_from_redis(self, r_serv):
ans = {}
Expand Down
5 changes: 5 additions & 0 deletions bin/packages/config.cfg.sample
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,11 @@ host = localhost
port = 6382
db = 6

[ARDB_Metadata]
host = localhost
port = 6382
db = 7

[Url]
cc_critical = DE

Expand Down
12 changes: 12 additions & 0 deletions var/www/modules/Flask_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@
db=cfg.getint("Redis_Paste_Name", "db"),
decode_responses=True)

r_serv_tags = redis.StrictRedis(
host=cfg.get("ARDB_Tags", "host"),
port=cfg.getint("ARDB_Tags", "port"),
db=cfg.getint("ARDB_Tags", "db"),
decode_responses=True)

r_serv_metadata = redis.StrictRedis(
host=cfg.get("ARDB_Metadata", "host"),
port=cfg.getint("ARDB_Metadata", "port"),
db=cfg.getint("ARDB_Metadata", "db"),
decode_responses=True)

# VARIABLES #
max_preview_char = int(cfg.get("Flask", "max_preview_char")) # Maximum number of character to display in the tooltip
max_preview_modal = int(cfg.get("Flask", "max_preview_modal")) # Maximum number of character to display in the modal
Expand Down
17 changes: 10 additions & 7 deletions var/www/modules/showpaste/Flask_showpaste.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
app = Flask_config.app
cfg = Flask_config.cfg
r_serv_pasteName = Flask_config.r_serv_pasteName
r_serv_metadata = Flask_config.r_serv_metadata
max_preview_char = Flask_config.max_preview_char
max_preview_modal = Flask_config.max_preview_modal
DiffMaxLineLength = Flask_config.DiffMaxLineLength
Expand All @@ -38,20 +39,22 @@ def showpaste(content_range):
p_mime = paste.p_mime
p_lineinfo = paste.get_lines_info()
p_content = paste.get_p_content()
p_duplicate_full_list = json.loads(paste._get_p_duplicate())
p_duplicate_str_full_list = paste._get_p_duplicate()

p_duplicate_full_list = []
p_duplicate_list = []
p_simil_list = []
p_date_list = []
p_hashtype_list = []


for dup_list in p_duplicate_full_list:
for dup_list in p_duplicate_str_full_list:
dup_list = dup_list[1:-1].replace('\'', '').replace(' ', '').split(',')
if dup_list[0] == "tlsh":
dup_list[2] = 100 - int(dup_list[2])
else:
print('dup_list')
print(dup_list)
dup_list[2] = int(dup_list[2])
p_duplicate_full_list.append(dup_list)

#p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True)

Expand All @@ -69,8 +72,8 @@ def showpaste(content_range):
comp_vals.append(p_duplicate_full_list[i][2])
dup_list_removed.append(i)

hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types)
comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals)
#hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types)
#comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals)

if len(p_duplicate_full_list[dup_list_index]) > 3:
try:
Expand All @@ -80,7 +83,7 @@ def showpaste(content_range):
date_paste = str(p_duplicate_full_list[dup_list_index][3])
else:
date_paste = "No date available"
new_dup_list.append([hash_types.replace("'", ""), p_duplicate_full_list[dup_list_index][1], comp_vals, date_paste])
new_dup_list.append([hash_types, p_duplicate_full_list[dup_list_index][1], comp_vals, date_paste])

# Create the list to pass to the webpage
for dup_list in new_dup_list:
Expand Down

0 comments on commit f66a528

Please sign in to comment.