-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #59 from dgraph-io/raphael/social
add social data and generator
- Loading branch information
Showing
4 changed files
with
194 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
|
||
Dataset created to support the blog post about variable propagation on 'social' use cases. | ||
|
||
https://dgraph.io/blog/post/20240923-variable-propgation/ | ||
|
||
|
||
The RDF file `contacts.rdf.gz` has been generated for 10000 users with the script generate.py. | ||
|
||
|
||
Start the dgraph container with the following command | ||
|
||
> docker run -it -d -p 8080:8080 -p 9080:9080 -v /path/to/dgraph-data:/dgraph --name dgraph-dev dgraph/standalone:latest | ||
Copy the files to the mounted directory so that they are seen in Docker. | ||
|
||
> cp contacts.rdf.gz <local path to /dgraph-data> | ||
> cp contacts.schema <local path to /dgraph-data> | ||
Use dgraph live command in the docker instance | ||
|
||
> docker exec -it dgraph-dev dgraph live -c 1 -f /dgraph/contacts.rdf.gz -s /dgraph/contacts.schema | ||
You can get some usernames | ||
|
||
```graphql | ||
{ | ||
user(func:has(username), first:5) { | ||
username | ||
} | ||
} | ||
``` | ||
|
||
and test the queries from the blog post | ||
## mutual 'follows' | ||
```graphql | ||
{ | ||
|
||
userA as var(func: eq(username, "barbara10")) { | ||
# use a named variable userA to be able to exclude this node later in the query | ||
c as math(1) # start c =1 on user A Node | ||
follows_of_userA as follows { | ||
# c is propagated, each follow is reached one time so c =1 for every follow | ||
~follows @filter(NOT uid(userA)) { | ||
# ~follows is the reverse relationship | ||
# users at this level are reached by all the common follows, | ||
# c = sum all path = count of common follows | ||
# keep the value in a variable, | ||
# in Dgraph a variable is a map uid -> value, so we have the count for every target | ||
mutual_follows as math(c) | ||
} | ||
} | ||
} | ||
|
||
target_user(func: uid(mutual_follows), orderdesc: val(mutual_follows), first:1) { | ||
username | ||
mutual_follows_count: val(mutual_follows) | ||
mutual_follows: follows @filter(uid(follows_of_userA)) { | ||
username | ||
} | ||
} | ||
} | ||
``` | ||
|
||
## mutual 'contacts' | ||
```graphql | ||
{ | ||
var(func: eq(username, "barbara10")) { | ||
c as math(1) | ||
userA_phone_number as ~belongs_to { | ||
userA_contacts as has_in_contacts { | ||
~has_in_contacts @filter(NOT uid(userA_phone_number)) { | ||
belongs_to{ | ||
mutual_contacts as math(c) | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
|
||
target_user(func: uid(mutual_contacts), orderdesc: val(mutual_contacts), first: 1) { | ||
username | ||
mutual_contact_count:val(mutual_contacts) | ||
phone:~belongs_to { | ||
phone_number | ||
mutual_contacts: has_in_contacts @filter(uid(userA_contacts)) { | ||
phone_number | ||
belongs_to { | ||
username | ||
} | ||
} | ||
} | ||
} | ||
} | ||
``` | ||
|
||
## computing a complex score | ||
|
||
```graphql | ||
{ | ||
userA as var(func: eq(username, "barbara10")) { | ||
c as math(1) # start c =1 on user A Node | ||
# first block to compute mutual follows using variable propagation | ||
follows { | ||
~follows @filter(NOT uid(userA)) { | ||
mutual_follows as math(c) | ||
} | ||
} | ||
# second block to compute mutual contacts using same variable ! | ||
# different path. | ||
userA_phone_number as ~belongs_to { | ||
has_in_contacts { | ||
~has_in_contacts @filter(NOT uid(userA_phone_number)) { | ||
belongs_to{ | ||
mutual_contacts as math(c) | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
# compute a score using the formula | ||
var(func: uid(mutual_follows, mutual_contacts)) { | ||
score as math(0.4 * mutual_follows + 0.6 * mutual_contacts) | ||
} | ||
# get target info | ||
target(func: uid(score), orderdesc: val(score), first: 1) { | ||
username | ||
score: val(score) | ||
count_mutual_follows: val(mutual_follows) | ||
count_mutual_contacts: val (mutual_contacts) | ||
} | ||
} | ||
``` |
Git LFS file not shown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
phone_number: string @index(hash) . | ||
username: string @index(hash) . | ||
has_in_contacts: [uid] @reverse . | ||
belongs_to: uid @reverse . | ||
follows: [uid] @reverse . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#!/usr/bin/python | ||
|
||
from faker import Faker | ||
import pandas as pd | ||
import gzip | ||
import sys | ||
from random import randint | ||
|
||
|
||
faker = Faker() | ||
def generate_data(size=500):
    """Build a DataFrame describing `size` fake users.

    Each row has a 'user_name' column (a unique Faker user name) and a
    'phone_number' column (a unique Faker MSISDN with its first four
    characters dropped).
    """
    # Phone numbers are drawn before user names, one full pass each.
    msisdns = [faker.unique.msisdn() for _ in range(size)]
    user_names = [faker.unique.user_name() for _ in range(size)]
    return pd.DataFrame({
        'user_name': user_names,
        'phone_number': [number[4:] for number in msisdns],
    })
|
||
|
||
def dataframe_to_rdf(data, filehandle=sys.stdout):
    """Write users, phone numbers and random relationships as RDF triples.

    For every row of `data` the following triples are written to
    `filehandle`, using the phone number and the user name as blank-node
    identifiers:

    * ``<phone_number>`` and ``<username>`` literal values,
    * ``<belongs_to>`` linking the phone number to its user,
    * ``<follows>`` from the user to a random sample of other users,
    * ``<has_in_contacts>`` from the phone number to a random sample of
      other phone numbers.

    Args:
        data: DataFrame with 'user_name' and 'phone_number' columns.
        filehandle: text-mode writable object; defaults to standard output.

    Each sample size is drawn with ``randint(5, 100)`` and then capped at the
    number of rows, because ``DataFrame.sample`` raises ``ValueError`` when
    asked for more rows than the frame holds.
    """
    population = len(data)
    for _, row in data.iterrows():
        user = row['user_name']
        phone = row['phone_number']

        # Node properties and the phone -> user ownership edge.
        triples = [
            f'<_:{phone}> <phone_number> "{phone}" .',
            f'<_:{user}> <username> "{user}" .',
            f'<_:{phone}> <belongs_to> <_:{user}> .',
        ]

        # Random set of users to follow; a user never follows itself.
        follows = data.sample(n=min(randint(5, 100), population))
        triples.extend(
            f'<_:{user}> <follows> <_:{target}> .'
            for target in follows['user_name']
            if target != user
        )

        # Random set of phone numbers held in this phone's contact list.
        contacts = data.sample(n=min(randint(5, 100), population))
        triples.extend(
            f'<_:{phone}> <has_in_contacts> <_:{target}> .'
            for target in contacts['phone_number']
            if target != phone
        )

        # One write per source row keeps memory bounded for large frames.
        filehandle.write("\n".join(triples) + "\n")
|
||
|
||
# Generate 10000 fake users and stream their triples into a gzip archive.
# The archive is opened in text mode ("wt") because dataframe_to_rdf writes str.
data = generate_data(10000)
with gzip.open("./contacts.rdf.gz", "wt") as rdf_file:
    dataframe_to_rdf(data, filehandle=rdf_file)
|
||
|
||
# ## load data set | ||
# Start the dgraph container with the following command | ||
# docker run -it -d -p 8080:8080 -p 9080:9080 -v /path/to/dgraph-data:/dgraph --name dgraph-v24 dgraph/standalone:latest | ||
# cp contacts.rdf.gz <local path to /dgraph-data> | ||
# cp contacts.schema <local path to /dgraph-data> | ||
# docker exec -it dgraph-v24 dgraph live -c 1 -f /dgraph/contacts.rdf.gz -s /dgraph/contacts.schema |