For creating and using indexing in gstudio #1201

Open
wants to merge 6 commits into base: mongokit
21 changes: 21 additions & 0 deletions gnowsys-ndf/create_indexes_mongoshell.js
@@ -0,0 +1,21 @@
/* Run this file from the terminal with: mongo < create_indexes_mongoshell.js
   This runs the script and creates indexes on the fields below. Whenever you
   want to index new fields, update the respective class in
   gnowsys-ndf/gnowsys_ndf/ndf/models.py and add the field here as well. */

conn=new Mongo();
db=conn.getDB("studio-dev");
db.Nodes.createIndex({'_type':1,'name':1})
db.Nodes.createIndex({'_type':1,'_id':1})
//db.Nodes.createIndex({'content':1})
//db.Nodes.createIndex({'tags':1})
//db.Nodes.createIndex({'status':1})
//db.Nodes.createIndex({'collection_set':1})
//db.Nodes.createIndex({'type_of':1})
//db.Nodes.createIndex({'member_of':1})
//db.Nodes.createIndex({'attribute_set':1})
//db.Nodes.createIndex({'relation_set':1})
db.Triples.createIndex({'_type':1,'name':1})
//db.Triples.createIndex({'object_value':1})
//db.Triples.createIndex({'status':1})
//db.Triples.createIndex({'right_subject':1})
db.Triples.createIndex({'_type':1,'subject':1,'attribute_type':1})
db.Triples.createIndex({'_type':1,'subject':1,'relation_type':1})
db.Nodes.createIndex({'member_of':1,'status':1,'last_update':1})
68 changes: 63 additions & 5 deletions gnowsys-ndf/gnowsys_ndf/ndf/models.py
@@ -229,7 +229,39 @@ class Node(DjangoDocument):
'ip_address':basestring}],
'snapshot':dict
}

indexes = [
    {
        'fields': ['_type', 'name'],
    },
    # {
    #     'fields': ['member_of', 'group_set'],
    # },
    {
        'fields': ['_type', '_id'],
    },
    {
        'fields': ['member_of', 'status', 'last_update'],
    },
    # {
    #     'fields': ['content'],
    # },
    # {
    #     'fields': ['tags'],
    # },
    # {
    #     'fields': ['collection_set'],
    # },
    # {
    #     'fields': ['type_of'],
    # },
    # {
    #     'fields': ['member_of'],
    # },
    # {
    #     'fields': ['status'],
    # },
]

required_fields = ['name', '_type'] # 'group_set' to be included
# here after the default
# 'Administration' group is
@@ -1106,8 +1138,15 @@ class GSystem(Node):
'annotations': [dict], # List of json files for annotations on the page
'license': basestring # contains license/s in string format
}

# indexes = [
#     {
#         'fields': ['attribute_set'],
#     },
#     {
#         'fields': ['relation_set'],
#     },
# ]
use_dot_notation = True


@connection.register
@@ -1597,7 +1636,17 @@ class Triple(DjangoDocument):
'lang': basestring, # Put validation for standard language codes
'status': STATUS_CHOICES_TU
}

indexes = [
    {
        'fields': ['_type', 'name'],
    },
    {
        'fields': ['_type', 'subject'],
    },
    # {
    #     'fields': ['status'],
    # },
]
required_fields = ['name', 'subject']
use_dot_notation = True
use_autorefs = True
@@ -1810,6 +1859,11 @@ class GAttribute(Triple):
}

required_fields = ['attribute_type', 'object_value']
indexes = [
    {
        'fields': ['attribute_type'],
    },
]
use_dot_notation = True
use_autorefs = True # To support Embedding of Documents

@@ -1823,7 +1877,11 @@ class GRelation(Triple):
# ObjectId's of GSystems Class / List of list of ObjectId's of GSystem Class
'right_subject': OR(ObjectId, list)
}

indexes = [
    {
        'fields': ['relation_type'],
    },
]
required_fields = ['relation_type', 'right_subject']
use_dot_notation = True
use_autorefs = True # To support Embedding of Documents
13 changes: 9 additions & 4 deletions gnowsys-ndf/gnowsys_ndf/ndf/templatetags/ndf_tags.py
@@ -1710,7 +1710,6 @@ def group_type_info(groupid,user=0):

return group_type


@get_execution_time
@register.assignment_tag
def user_access_policy(node, user):
@@ -1732,7 +1731,11 @@ def user_access_policy(node, user):
string value (allow/disallow), i.e. whether user is allowed or not!
"""
user_access = False

# Low-level cache API used in this function; the key includes both the group
# and the user, since access is decided per user (keying on the group alone
# would serve one user's cached result to every other user of that group).
group_name, group_id = get_group_name_id(node)
cache_key = 'access' + str(group_id) + '_' + str(user.id if user else user)
cache_result = cache.get(cache_key)
if cache_result:
    return cache_result
try:
# Please make a note, here the order in which check is performed is IMPORTANT!

@@ -1741,7 +1744,6 @@ def user_access_policy(node, user):

else:
# group_node = node_collection.one({'_type': {'$in': ["Group", "Author"]}, '_id': ObjectId(node)})
group_name, group_id = get_group_name_id(node)
group_node = node_collection.one({"_id": ObjectId(group_id)})

if user.id == group_node.created_by:
Expand All @@ -1760,15 +1762,18 @@ def user_access_policy(node, user):
user_access = False

if user_access:
    cache.set(cache_key, "allow")
    return "allow"

else:
    cache.set(cache_key, "disallow")
    return "disallow"

except Exception as e:
error_message = "\n UserAccessPolicyError: " + str(e) + " !!!\n"
raise Exception(error_message)

# cache implemented in the method above

@get_execution_time
@register.assignment_tag
def resource_info(node):
45 changes: 45 additions & 0 deletions indexing_readme.txt
@@ -0,0 +1,45 @@
To create and use indexes we have to follow two steps:

1) Index usage:
This step is straightforward: go to the file where the database class is defined (in gstudio it is models.py) and, below the structure variable (where the various fields and their properties are defined), add another variable called indexes listing the fields on which we want to build indexes.

For Example-

>>> class MyDoc(Document):
... structure = {
... 'standard':unicode,
... 'other':{
... 'deep':unicode,
... },
... 'notindexed':unicode,
... }
...
... indexes = [
... {
... 'fields':['standard', 'other.deep'],
... 'unique':True,
... },
... ]

In versions of mongokit before 0.7.1 this simple addition would also automatically create the indexes in mongoDB. In later versions automatic index creation was removed (as people felt that indexes should be created with care, directly on the collection). Gstudio uses mongokit version 0.9.1.1, so the addition above only enables the database class to use the indexes if they are already present in the database. We now have to create the indexes in mongoDB manually through mongo shell commands (mongokit in fact tells us this via a deprecation warning: index creation is no longer automatic and has to be done manually).

2) Index creation:
We could run the createIndex() command directly in the mongo shell on each collection for the required fields. But doing that by hand is repetitive, so we wrote the commands in a js file and then ran the script.

For Example-

test.js =>
conn = new Mongo()
db = conn.getDB("studio-dev")
db.Nodes.createIndex({'_type':1,'name':1})
// just keep adding these commands to create indexes on the required fields of the desired collections

$ mongo < test.js   # run from the terminal

As stated in the mongoDB documentation, it is preferable to use the createIndex() command rather than ensureIndex().
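
For reference, the same indexes can also be created and verified from Python with pymongo; a minimal sketch, assuming the database name "studio-dev" and the collection names used in the script above:

from pymongo import MongoClient, ASCENDING

client = MongoClient()  # assumes a mongod running on localhost:27017
db = client['studio-dev']

# create_index is idempotent: re-running it on an existing index does nothing
db['Nodes'].create_index([('_type', ASCENDING), ('name', ASCENDING)])
db['Triples'].create_index([('_type', ASCENDING), ('name', ASCENDING)])

# verify which indexes exist on a collection
print(db['Nodes'].index_information())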

3) Index changes:
If more fields are to be indexed in the future, append them to the js file (present in the gnowsys-ndf folder as 'create_indexes_mongoshell.js') and run it again (Point 2); if an index is already present, the instruction does nothing. Also make the corresponding changes to the indexes variable in the database class (models.py) (Point 1). If no indexing changes are made, running the script once is sufficient.



107 changes: 107 additions & 0 deletions optimize_readme.txt
@@ -0,0 +1,107 @@
1) Creating temporary variables for faster lookup:
Lists are a very commonly used data structure for storing mutable information in Python, and we frequently call append on them. Inside a loop, the repeated attribute lookup of the append method is a costly operation, but a fairly simple trick reduces the time.
def func1():
    lst1 = []
    lst2 = []
    for i in range(500):
        lst1.append(i)
        lst2.append(i + 500)
    for j in lst2:
        lst1.append(j)

def func2():
    lst1 = []
    lst2 = []
    l1_append_temp = lst1.append  # a temporary variable:
    l2_append_temp = lst2.append  # the lookup for append is already done
    for i in range(500):
        l1_append_temp(i)
        l2_append_temp(i + 500)
    for j in lst2:
        l1_append_temp(j)

Using Python's timeit library we timed the functions and got the following results:
func1 - 0.048635005950927734 s
func2 - 0.032353162765502930 s
(Note that we are dealing with relatively small data here, so the time difference is small, but with big data the difference can be huge.)

In each loop iteration we first look up the list's append attribute and then call it; by using the temporary variable (where we stored the lookup early on) we skip the first step.
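
A minimal sketch of how such timings can be reproduced with timeit (the function and iteration count here are our stand-ins, not necessarily what was used above):

import timeit

def append_loop():  # hypothetical stand-in for func1/func2 above
    lst = []
    for i in range(500):
        lst.append(i)

# timeit.timeit accepts any zero-argument callable; total seconds for 100 calls
print(timeit.timeit(append_loop, number=100))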

We can use the same technique for database queries that are issued repeatedly inside loops.
e.g. db_find_temp = node_collection.find

(Note that using this when no loops are present gives no time advantage. In fact, using it without loops can reduce readability and clutter the scope with many local variables.)
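
A minimal sketch of the same trick applied to a query inside a loop, assuming node_collection is a pymongo collection (the database name and node names here are illustrative):

from pymongo import MongoClient

client = MongoClient()                        # assumes a local mongod
node_collection = client['studio-dev'].Nodes  # stand-in for gstudio's node_collection

ids = []
find_one_temp = node_collection.find_one  # method lookup done once, outside the loop
ids_append_temp = ids.append
for name in ('home', 'desktop', 'language'):  # hypothetical node names
    doc = find_one_temp({'name': name})
    if doc:
        ids_append_temp(doc['_id'])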

2) Multiprocessing library of Python:
Because of the GIL (Global Interpreter Lock), working with threads in Python is not as effective as in older languages like C.
The GIL synchronizes the execution of threads so that only one thread executes Python bytecode at a time, even if the computer has multiple cores and could run threads simultaneously. The multiprocessing library sidesteps this by using separate processes, giving the programmer some leeway to use the multiple cores. (Note that spawning processes creates significant overhead, so this should be used only for big loops.) The old rule that multiprocessing must be used only on independent pieces of work applies in Python as well.

def func3():
    for each_gapp in already_selected_gapps:
        gapp_name = each_gapp["name"]
        if gapp_name in gapps_list:
            gapps_list_remove(gapp_name)

import multiprocessing as mp

def func4():
    processes = []
    lst1 = already_selected_gapps
    n1 = len(lst1)
    x = mp.cpu_count()
    n2 = n1 // x  # chunk size: divide the list (of independent elements) among the cores
    for i in range(x):
        # the last chunk also takes the remainder; note args must be a tuple
        chunk = lst1[i * n2:] if i == x - 1 else lst1[i * n2:(i + 1) * n2]
        processes.append(mp.Process(target=multi_, args=(chunk,)))
    for p in processes:
        p.start()
    for p in processes:
        p.join()

def multi_(lst):  # the loop logic goes in a function so that each process can run it
    for each_gapp in lst:
        gapp_name = each_gapp["name"]
        if gapp_name in gapps_list:
            gapps_list_remove(gapp_name)

# Note: each child process works on a copy of gapps_list, so removals made inside
# multi_ are not visible in the parent; the work items must really be independent.
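
The same partitioning can be expressed more simply with multiprocessing.Pool; a sketch under the assumption that the per-chunk work returns results instead of mutating shared state (the stand-in data is ours):

import multiprocessing as mp

def gapp_names(chunk):
    # per-chunk work: collect names instead of removing from a shared list
    return [each_gapp["name"] for each_gapp in chunk]

if __name__ == '__main__':
    already_selected_gapps = [{"name": "gapp%d" % i} for i in range(1000)]  # stand-in data
    x = mp.cpu_count()
    n2 = max(1, len(already_selected_gapps) // x)
    chunks = [already_selected_gapps[i:i + n2]
              for i in range(0, len(already_selected_gapps), n2)]
    pool = mp.Pool(processes=x)
    results = pool.map(gapp_names, chunks)  # one list of names per chunk
    pool.close()
    pool.join()
    names = [n for sub in results for n in sub]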


3) List comprehensions:
The best way to visualize list comprehensions is to think of them as sets in set-builder form. They are an excellent alternative to loops over lists and result in faster computation.

A = {x^2 : x in {1,2,3}} = {1,4,9}

def func5():
    lst = []
    lst2 = []
    for i in range(500):
        lst2.append(i)
    for i in lst2:
        lst.append(i * i)

def func6():
    lst2 = []
    lst2_append_temp = lst2.append
    for i in range(500):
        lst2_append_temp(i)
    lst = [i * i for i in lst2]  # note the similarity with set-builder form

Using Python's timeit library we timed the functions and got the following results:
func5 - 0.047894954681396484 s
func6 - 0.021952867507934570 s
(Note that we are dealing with relatively small data here, so the time difference is small, but with big data the difference can be huge.)

The general format of a list comprehension is:
new_list = [expression for item in old_list if condition]

This is equivalent to:
new_list = []
for item in old_list:
    if condition:
        new_list.append(expression)

e.g.
new_lst = [x**2 for x in old_lst if x % 2 == 0]
is equivalent to
new_lst = []
for x in old_lst:
    if x % 2 == 0:
        new_lst.append(x**2)

Note that the comprehension builds a brand-new list and rebinds new_lst, so any previous contents of new_lst are discarded. (It is even safe to write lst = [x*x for x in lst], since the comprehension is fully evaluated before the name is rebound.)